#
#   Cython -- encoding related tools
#

from __future__ import absolute_import

import re
import sys

# Pick the text/bytes primitives matching the running interpreter.
IS_PYTHON3 = sys.version_info[0] >= 3
if IS_PYTHON3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr

empty_bytes = _bytes()
empty_unicode = _unicode()

join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Collect text fragments and code points into one unicode literal."""

    def __init__(self):
        # Accumulated unicode fragments, joined by getstring().
        self.chars = []

    def append(self, characters):
        """Add a text fragment; byte strings are decoded as ASCII first."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Add one code point, split into surrogates when above the BMP."""
            if char_number <= 65535:
                self.chars.append(_unichr(char_number))
                return
            # wide Unicode character on narrow platform => replace
            # by surrogate pair
            high, low = divmod(char_number - 0x10000, 1024)
            self.chars.append(_unichr(high + 0xD800))
            self.chars.append(_unichr(low + 0xDC00))
    else:
        def append_charval(self, char_number):
            """Add one code point as a single character."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Add the character a unicode escape stands for (its text is unused)."""
        self.append_charval(char_number)

    def getstring(self):
        """Return everything collected so far as an EncodedString."""
        text = u''.join(self.chars)
        return EncodedString(text)

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes slot is always None."""
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Collect fragments and char values into one byte string literal."""

    def __init__(self, target_encoding):
        # Accumulated byte fragments, joined by getstring().
        self.chars = []
        # Encoding applied to unicode fragments handed to append().
        self.target_encoding = target_encoding

    def append(self, characters):
        """Add a fragment; unicode input is encoded with the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Add a single character value, encoded through ISO-8859-1."""
        encoded = _unichr(char_number).encode('ISO-8859-1')
        self.chars.append(encoded)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape verbatim, i.e. its source text, not its value."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a BytesLiteral."""
        # this *must* return a byte string!
        joined = join_bytes(self.chars)
        return bytes_literal(joined, self.target_encoding)

    def getchar(self):
        """Return the collected bytes; same result as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode slot is always None."""
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Build the bytes and the unicode form of a string side by side."""

    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Add a fragment to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Add a character value to both representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape: verbatim text for bytes, the char for unicode."""
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes, unicode) pair built so far."""
        return (self._bytes.getstring(), self._unicode.getstring())


class EncodedString(_unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Encode with the recorded source encoding (which must be set)."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no source encoding is recorded."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """Report whether this string holds surrogate or non-BMP characters."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded form wrapped in a BytesLiteral."""
        return bytes_literal(self.utf8encode(), 'utf8')


def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds any character that a narrow
    (UTF-16) CPython build spells as two code units: either a code point
    above 0xFFFF (only possible on wide builds) or a code point inside
    the surrogate range 0xD800-0xDFFF.
    """
    return any(
        code_point > 65535 or 0xD800 <= code_point <= 0xDFFF
        for code_point in map(ord, ustring)
    )


class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
    encoding = None
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Return the content as a plain (non-subclass) bytes object."""
        if not IS_PYTHON3:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        """Always fails: a byte literal is not a unicode string."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the bytes via ISO-8859-1 so that the value can take
        part in % formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    def as_c_string_literal(self):
        """Return the escaped content wrapped in double quotes for C code."""
        escaped = escape_byte_string(self)
        return '"%s"' % split_string_literal(escaped)


def bytes_literal(s, encoding):
    """Wrap the byte string `s` in a BytesLiteral tagged with `encoding`."""
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal


def encoded_string(s, encoding):
    """Wrap `s` in an EncodedString, recording `encoding` unless it is None."""
    assert isinstance(s, (_unicode, bytes))
    result = EncodedString(s)
    if encoding is not None:
        result.encoding = encoding
    return result


# Lookup from a two-character escape sequence to the character it denotes;
# unknown sequences give None.
char_from_escape_sequence = {
    r'\a': u'\a',
    r'\b': u'\b',
    r'\f': u'\f',
    r'\n': u'\n',
    r'\r': u'\r',
    r'\t': u'\t',
    r'\v': u'\v',
}.get

# Sequences that get rewritten by escape_byte_string(): backslash,
# the '??' pair, the double quote and all characters below 32.
_c_special = ('\\', '??', '"') + tuple(chr(code) for code in range(32))


def _to_escape_sequence(s):
    """Return the C escape text used in place of the special sequence `s`."""
    if s in '\n\r\t':
        return repr(s)[1:-1]
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    # within a character sequence, oct passes much better than hex
    return ''.join('\\%03o' % ord(char) for char in s)


def _build_specials_replacer():
    """Build the function that escapes every `_c_special` in a byte string."""
    alternatives = []
    escapes = {}
    for special in _c_special:
        # one single-character class per character of the special sequence
        pattern = ''.join(
            '[%s]' % char.replace('\\', '\\\\') for char in special)
        alternatives.append(pattern)
        escapes[special.encode('ASCII')] = (
            _to_escape_sequence(special).encode('ASCII'))

    matcher = re.compile(('(%s)' % '|'.join(alternatives)).encode('ASCII'))

    def lookup(match):
        return escapes[match.group(1)]

    def replace(s):
        return matcher.sub(lookup, s)

    return replace


_replace_specials = _build_specials_replacer()


def escape_char(c):
    """Return the text that spells the single character `c` in C source."""
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    # hex works well for characters
    return "\\x%02X" % code


def escape_byte_string(s):
    """Escape a byte string for inclusion in C source code.

    The result is a Unicode string; encoding it as ISO-8859-1 yields the
    byte sequence that has to be written out.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: plain ASCII => done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    if IS_PYTHON3:
        # iterating bytes gives integers here
        escaped = bytearray()
        for byte in s:
            if byte < 128:
                escaped.append(byte)
            else:
                escaped.extend(('\\%3o' % byte).encode('ASCII'))
        return escaped.decode('ISO-8859-1')

    # iterating a Py2 str gives one-character strings
    pieces = []
    for char in s:
        code = ord(char)
        pieces.append('\\%3o' % code if code >= 128 else char)
    return join_bytes(pieces).decode('ISO-8859-1')


def split_string_literal(s, limit=2000):
    """Cut `s` into chunks of at most `limit` characters joined by '""'.

    Strings shorter than `limit` come back unchanged.  A cut is moved
    back so that it never lands right after a backslash.
    """
    # MSVC can't handle long string literals.
    total = len(s)
    if total < limit:
        return s

    pieces = []
    begin = 0
    while begin < total:
        stop = begin + limit
        tail = s[stop-4:stop]
        if total > stop - 4 and '\\' in tail:
            # just before the backslash
            stop -= 4 - tail.find('\\')
            while s[stop-1] == '\\':
                stop -= 1
                if stop == begin:
                    # must have been a long line of backslashes
                    stop = begin + limit - (limit % 2) - 4
                    break
        pieces.append(s[begin:stop])
        begin = stop
    return '""'.join(pieces)


def encode_pyunicode_string(s):
    """Return the Py_UNICODE[] initialiser values for a unicode string.

    The result is a pair of comma separated number lists, UTF-16 code
    units first and UTF-32 code points second, each ending in 0.  The
    UTF-16 list is empty when both lists would be identical.
    """
    values = [ord(char) for char in s] + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], values
        for code_point in values:
            if code_point < 0x10000:
                utf16.append(code_point)
            else:
                # outside of BMP => split into a surrogate pair
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.extend((high + 0xD800, low + 0xDC00))
    else:
        utf16, utf32 = values, []
        for code_unit in values:
            completes_pair = (
                0xDC00 <= code_unit <= 0xDFFF
                and utf32
                and 0xD800 <= utf32[-1] <= 0xDBFF
            )
            if completes_pair:
                # merge the low surrogate into the preceding high one
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))