diff options
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r-- | Lib/email/charset.py | 33 |
1 files changed, 24 insertions, 9 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 898beedb00b..f22be2c52c1 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers RFC2047_CHROME_LEN = 7 DEFAULT_CHARSET = 'us-ascii' +UNKNOWN8BIT = 'unknown-8bit' EMPTYSTRING = '' @@ -153,6 +154,16 @@ def add_codec(charset, codecname): +# Convenience function for encoding strings, taking into account +# that they might be unknown-8bit (ie: have surrogate-escaped bytes) +def _encode(string, codec): + if codec == UNKNOWN8BIT: + return string.encode('ascii', 'surrogateescape') + else: + return string.encode(codec) + + + class Charset: """Map character sets to their email properties. @@ -252,7 +263,7 @@ class Charset: Returns "quoted-printable" if self.body_encoding is QP. Returns "base64" if self.body_encoding is BASE64. - Returns "7bit" otherwise. + Returns conversion function otherwise. """ assert self.body_encoding != SHORTEST if self.body_encoding == QP: @@ -282,8 +293,7 @@ class Charset: :return: The encoded string, with RFC 2047 chrome. """ codec = self.output_codec or 'us-ascii' - charset = self.get_output_charset() - header_bytes = string.encode(codec) + header_bytes = _encode(string, codec) # 7bit/8bit encodings return the string unchanged (modulo conversions) encoder_module = self._get_encoder(header_bytes) if encoder_module is None: @@ -309,9 +319,9 @@ class Charset: """ # See which encoding we should use. codec = self.output_codec or 'us-ascii' - header_bytes = string.encode(codec) + header_bytes = _encode(string, codec) encoder_module = self._get_encoder(header_bytes) - encoder = partial(encoder_module.header_encode, charset=str(self)) + encoder = partial(encoder_module.header_encode, charset=codec) # Calculate the number of characters that the RFC 2047 chrome will # contribute to each line. charset = self.get_output_charset() @@ -333,7 +343,7 @@ class Charset: for character in string: current_line.append(character) this_line = EMPTYSTRING.join(current_line) - length = encoder_module.header_length(this_line.encode(charset)) + length = encoder_module.header_length(_encode(this_line, charset)) if length > maxlen: # This last character doesn't fit so pop it off. current_line.pop() @@ -343,12 +353,12 @@ class Charset: else: separator = (' ' if lines else '') joined_line = EMPTYSTRING.join(current_line) - header_bytes = joined_line.encode(codec) + header_bytes = _encode(joined_line, codec) lines.append(encoder(header_bytes)) current_line = [character] maxlen = next(maxlengths) - extra joined_line = EMPTYSTRING.join(current_line) - header_bytes = joined_line.encode(codec) + header_bytes = _encode(joined_line, codec) lines.append(encoder(header_bytes)) return lines @@ -371,7 +381,10 @@ class Charset: """Body-encode a string by converting it first to bytes. The type of encoding (base64 or quoted-printable) will be based on - self.body_encoding. + self.body_encoding. If body_encoding is None, we assume the + output charset is a 7bit encoding, so re-encoding the decoded + string using the ascii codec produces the correct string version + of the content. """ # 7bit/8bit encodings return the string unchanged (module conversions) if self.body_encoding is BASE64: @@ -381,4 +394,6 @@ class Charset: elif self.body_encoding is QP: return email.quoprimime.body_encode(string) else: + if isinstance(string, str): + string = string.encode(self.output_charset).decode('ascii') return string |