diff options
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r-- | Lib/email/charset.py | 238 |
1 files changed, 120 insertions, 118 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py index dddaa76c55d..f22be2c52c1 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001-2006 Python Software Foundation +# Copyright (C) 2001-2007 Python Software Foundation # Author: Ben Gertzfield, Barry Warsaw # Contact: email-sig@python.org @@ -9,7 +9,8 @@ __all__ = [ 'add_codec', ] -import codecs +from functools import partial + import email.base64mime import email.quoprimime @@ -24,9 +25,11 @@ BASE64 = 2 # Base64 SHORTEST = 3 # the shorter of QP and base64, but only for headers # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7 -MISC_LEN = 7 +RFC2047_CHROME_LEN = 7 DEFAULT_CHARSET = 'us-ascii' +UNKNOWN8BIT = 'unknown-8bit' +EMPTYSTRING = '' @@ -58,8 +61,6 @@ CHARSETS = { 'iso-2022-jp': (BASE64, None, None), 'koi8-r': (BASE64, BASE64, None), 'utf-8': (SHORTEST, BASE64, 'utf-8'), - # We're making this one up to represent raw unencoded 8-bit - '8bit': (None, BASE64, 'utf-8'), } # Aliases for other commonly-used names for character sets. Map @@ -153,6 +154,16 @@ def add_codec(charset, codecname): +# Convenience function for encoding strings, taking into account +# that they might be unknown-8bit (ie: have surrogate-escaped bytes) +def _encode(string, codec): + if codec == UNKNOWN8BIT: + return string.encode('ascii', 'surrogateescape') + else: + return string.encode(codec) + + + class Charset: """Map character sets to their email properties. @@ -203,19 +214,14 @@ class Charset: # is already a unicode, we leave it at that, but ensure that the # charset is ASCII, as the standard (RFC XXX) requires. try: - if isinstance(input_charset, unicode): + if isinstance(input_charset, str): input_charset.encode('ascii') else: - input_charset = unicode(input_charset, 'ascii') + input_charset = str(input_charset, 'ascii') except UnicodeError: raise errors.CharsetError(input_charset) - input_charset = input_charset.lower().encode('ascii') - # Set the input charset after filtering through the aliases and/or codecs - if not (input_charset in ALIASES or input_charset in CHARSETS): - try: - input_charset = codecs.lookup(input_charset).name - except LookupError: - pass + input_charset = input_charset.lower() + # Set the input charset after filtering through the aliases self.input_charset = ALIASES.get(input_charset, input_charset) # We can try to guess which encoding and conversion to use by the # charset_map dictionary. Try that first, but let the user override @@ -257,7 +263,7 @@ class Charset: Returns "quoted-printable" if self.body_encoding is QP. Returns "base64" if self.body_encoding is BASE64. - Returns "7bit" otherwise. + Returns conversion function otherwise. """ assert self.body_encoding != SHORTEST if self.body_encoding == QP: @@ -267,60 +273,6 @@ class Charset: else: return encode_7or8bit - def convert(self, s): - """Convert a string from the input_codec to the output_codec.""" - if self.input_codec != self.output_codec: - return unicode(s, self.input_codec).encode(self.output_codec) - else: - return s - - def to_splittable(self, s): - """Convert a possibly multibyte string to a safely splittable format. - - Uses the input_codec to try and convert the string to Unicode, so it - can be safely split on character boundaries (even for multibyte - characters). - - Returns the string as-is if it isn't known how to convert it to - Unicode with the input_charset. - - Characters that could not be converted to Unicode will be replaced - with the Unicode replacement character U+FFFD. - """ - if isinstance(s, unicode) or self.input_codec is None: - return s - try: - return unicode(s, self.input_codec, 'replace') - except LookupError: - # Input codec not installed on system, so return the original - # string unchanged. - return s - - def from_splittable(self, ustr, to_output=True): - """Convert a splittable string back into an encoded string. - - Uses the proper codec to try and convert the string from Unicode back - into an encoded format. Return the string as-is if it is not Unicode, - or if it could not be converted from Unicode. - - Characters that could not be converted from Unicode will be replaced - with an appropriate character (usually '?'). - - If to_output is True (the default), uses output_codec to convert to an - encoded format. If to_output is False, uses input_codec. - """ - if to_output: - codec = self.output_codec - else: - codec = self.input_codec - if not isinstance(ustr, unicode) or codec is None: - return ustr - try: - return ustr.encode(codec, 'replace') - except LookupError: - # Output codec not installed - return ustr - def get_output_charset(self): """Return the output character set. @@ -329,69 +281,119 @@ class Charset: """ return self.output_charset or self.input_charset - def encoded_header_len(self, s): - """Return the length of the encoded header string.""" - cset = self.get_output_charset() - # The len(s) of a 7bit encoding is len(s) - if self.header_encoding == BASE64: - return email.base64mime.base64_len(s) + len(cset) + MISC_LEN - elif self.header_encoding == QP: - return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN - elif self.header_encoding == SHORTEST: - lenb64 = email.base64mime.base64_len(s) - lenqp = email.quoprimime.header_quopri_len(s) - return min(lenb64, lenqp) + len(cset) + MISC_LEN - else: - return len(s) - - def header_encode(self, s, convert=False): - """Header-encode a string, optionally converting it to output_charset. - - If convert is True, the string will be converted from the input - charset to the output charset automatically. This is not useful for - multibyte character sets, which have line length issues (multibyte - characters must be split on a character, not a byte boundary); use the - high-level Header class to deal with these issues. convert defaults - to False. + def header_encode(self, string): + """Header-encode a string by converting it first to bytes. The type of encoding (base64 or quoted-printable) will be based on - self.header_encoding. + this charset's `header_encoding`. + + :param string: A unicode string for the header. It must be possible + to encode this string to bytes using the character set's + output codec. + :return: The encoded string, with RFC 2047 chrome. """ - cset = self.get_output_charset() - if convert: - s = self.convert(s) + codec = self.output_codec or 'us-ascii' + header_bytes = _encode(string, codec) # 7bit/8bit encodings return the string unchanged (modulo conversions) + encoder_module = self._get_encoder(header_bytes) + if encoder_module is None: + return string + return encoder_module.header_encode(header_bytes, codec) + + def header_encode_lines(self, string, maxlengths): + """Header-encode a string by converting it first to bytes. + + This is similar to `header_encode()` except that the string is fit + into maximum line lengths as given by the argument. + + :param string: A unicode string for the header. It must be possible + to encode this string to bytes using the character set's + output codec. + :param maxlengths: Maximum line length iterator. Each element + returned from this iterator will provide the next maximum line + length. This parameter is used as an argument to built-in next() + and should never be exhausted. The maximum line lengths should + not count the RFC 2047 chrome. These line lengths are only a + hint; the splitter does the best it can. + :return: Lines of encoded strings, each with RFC 2047 chrome. + """ + # See which encoding we should use. + codec = self.output_codec or 'us-ascii' + header_bytes = _encode(string, codec) + encoder_module = self._get_encoder(header_bytes) + encoder = partial(encoder_module.header_encode, charset=codec) + # Calculate the number of characters that the RFC 2047 chrome will + # contribute to each line. + charset = self.get_output_charset() + extra = len(charset) + RFC2047_CHROME_LEN + # Now comes the hard part. We must encode bytes but we can't split on + # bytes because some character sets are variable length and each + # encoded word must stand on its own. So the problem is you have to + # encode to bytes to figure out this word's length, but you must split + # on characters. This causes two problems: first, we don't know how + # many octets a specific substring of unicode characters will get + # encoded to, and second, we don't know how many ASCII characters + # those octets will get encoded to. Unless we try it. Which seems + # inefficient. In the interest of being correct rather than fast (and + # in the hope that there will be few encoded headers in any such + # message), brute force it. :( + lines = [] + current_line = [] + maxlen = next(maxlengths) - extra + for character in string: + current_line.append(character) + this_line = EMPTYSTRING.join(current_line) + length = encoder_module.header_length(_encode(this_line, charset)) + if length > maxlen: + # This last character doesn't fit so pop it off. + current_line.pop() + # Does nothing fit on the first line? + if not lines and not current_line: + lines.append(None) + else: + separator = (' ' if lines else '') + joined_line = EMPTYSTRING.join(current_line) + header_bytes = _encode(joined_line, codec) + lines.append(encoder(header_bytes)) + current_line = [character] + maxlen = next(maxlengths) - extra + joined_line = EMPTYSTRING.join(current_line) + header_bytes = _encode(joined_line, codec) + lines.append(encoder(header_bytes)) + return lines + + def _get_encoder(self, header_bytes): if self.header_encoding == BASE64: - return email.base64mime.header_encode(s, cset) + return email.base64mime elif self.header_encoding == QP: - return email.quoprimime.header_encode(s, cset, maxlinelen=None) + return email.quoprimime elif self.header_encoding == SHORTEST: - lenb64 = email.base64mime.base64_len(s) - lenqp = email.quoprimime.header_quopri_len(s) - if lenb64 < lenqp: - return email.base64mime.header_encode(s, cset) + len64 = email.base64mime.header_length(header_bytes) + lenqp = email.quoprimime.header_length(header_bytes) + if len64 < lenqp: + return email.base64mime else: - return email.quoprimime.header_encode(s, cset, maxlinelen=None) + return email.quoprimime else: - return s - - def body_encode(self, s, convert=True): - """Body-encode a string and convert it to output_charset. + return None - If convert is True (the default), the string will be converted from - the input charset to output charset automatically. Unlike - header_encode(), there are no issues with byte boundaries and - multibyte charsets in email bodies, so this is usually pretty safe. + def body_encode(self, string): + """Body-encode a string by converting it first to bytes. The type of encoding (base64 or quoted-printable) will be based on - self.body_encoding. + self.body_encoding. If body_encoding is None, we assume the + output charset is a 7bit encoding, so re-encoding the decoded + string using the ascii codec produces the correct string version + of the content. """ - if convert: - s = self.convert(s) # 7bit/8bit encodings return the string unchanged (module conversions) if self.body_encoding is BASE64: - return email.base64mime.body_encode(s) + if isinstance(string, str): + string = string.encode(self.output_charset) + return email.base64mime.body_encode(string) elif self.body_encoding is QP: - return email.quoprimime.body_encode(s) + return email.quoprimime.body_encode(string) else: - return s + if isinstance(string, str): + string = string.encode(self.output_charset).decode('ascii') + return string |