aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Lib/email/charset.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email/charset.py')
-rw-r--r--Lib/email/charset.py238
1 files changed, 120 insertions, 118 deletions
diff --git a/Lib/email/charset.py b/Lib/email/charset.py
index dddaa76c55d..f22be2c52c1 100644
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2006 Python Software Foundation
+# Copyright (C) 2001-2007 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org
@@ -9,7 +9,8 @@ __all__ = [
'add_codec',
]
-import codecs
+from functools import partial
+
import email.base64mime
import email.quoprimime
@@ -24,9 +25,11 @@ BASE64 = 2 # Base64
SHORTEST = 3 # the shorter of QP and base64, but only for headers
# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
-MISC_LEN = 7
+RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
+EMPTYSTRING = ''
@@ -58,8 +61,6 @@ CHARSETS = {
'iso-2022-jp': (BASE64, None, None),
'koi8-r': (BASE64, BASE64, None),
'utf-8': (SHORTEST, BASE64, 'utf-8'),
- # We're making this one up to represent raw unencoded 8-bit
- '8bit': (None, BASE64, 'utf-8'),
}
# Aliases for other commonly-used names for character sets. Map
@@ -153,6 +154,16 @@ def add_codec(charset, codecname):
+# Convenience function for encoding strings, taking into account
+# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
+def _encode(string, codec):
+ if codec == UNKNOWN8BIT:
+ return string.encode('ascii', 'surrogateescape')
+ else:
+ return string.encode(codec)
+
+
+
class Charset:
"""Map character sets to their email properties.
@@ -203,19 +214,14 @@ class Charset:
# is already a unicode, we leave it at that, but ensure that the
# charset is ASCII, as the standard (RFC XXX) requires.
try:
- if isinstance(input_charset, unicode):
+ if isinstance(input_charset, str):
input_charset.encode('ascii')
else:
- input_charset = unicode(input_charset, 'ascii')
+ input_charset = str(input_charset, 'ascii')
except UnicodeError:
raise errors.CharsetError(input_charset)
- input_charset = input_charset.lower().encode('ascii')
- # Set the input charset after filtering through the aliases and/or codecs
- if not (input_charset in ALIASES or input_charset in CHARSETS):
- try:
- input_charset = codecs.lookup(input_charset).name
- except LookupError:
- pass
+ input_charset = input_charset.lower()
+ # Set the input charset after filtering through the aliases
self.input_charset = ALIASES.get(input_charset, input_charset)
# We can try to guess which encoding and conversion to use by the
# charset_map dictionary. Try that first, but let the user override
@@ -257,7 +263,7 @@ class Charset:
Returns "quoted-printable" if self.body_encoding is QP.
Returns "base64" if self.body_encoding is BASE64.
- Returns "7bit" otherwise.
+ Returns conversion function otherwise.
"""
assert self.body_encoding != SHORTEST
if self.body_encoding == QP:
@@ -267,60 +273,6 @@ class Charset:
else:
return encode_7or8bit
- def convert(self, s):
- """Convert a string from the input_codec to the output_codec."""
- if self.input_codec != self.output_codec:
- return unicode(s, self.input_codec).encode(self.output_codec)
- else:
- return s
-
- def to_splittable(self, s):
- """Convert a possibly multibyte string to a safely splittable format.
-
- Uses the input_codec to try and convert the string to Unicode, so it
- can be safely split on character boundaries (even for multibyte
- characters).
-
- Returns the string as-is if it isn't known how to convert it to
- Unicode with the input_charset.
-
- Characters that could not be converted to Unicode will be replaced
- with the Unicode replacement character U+FFFD.
- """
- if isinstance(s, unicode) or self.input_codec is None:
- return s
- try:
- return unicode(s, self.input_codec, 'replace')
- except LookupError:
- # Input codec not installed on system, so return the original
- # string unchanged.
- return s
-
- def from_splittable(self, ustr, to_output=True):
- """Convert a splittable string back into an encoded string.
-
- Uses the proper codec to try and convert the string from Unicode back
- into an encoded format. Return the string as-is if it is not Unicode,
- or if it could not be converted from Unicode.
-
- Characters that could not be converted from Unicode will be replaced
- with an appropriate character (usually '?').
-
- If to_output is True (the default), uses output_codec to convert to an
- encoded format. If to_output is False, uses input_codec.
- """
- if to_output:
- codec = self.output_codec
- else:
- codec = self.input_codec
- if not isinstance(ustr, unicode) or codec is None:
- return ustr
- try:
- return ustr.encode(codec, 'replace')
- except LookupError:
- # Output codec not installed
- return ustr
-
def get_output_charset(self):
"""Return the output character set.
@@ -329,69 +281,119 @@ class Charset:
"""
return self.output_charset or self.input_charset
- def encoded_header_len(self, s):
- """Return the length of the encoded header string."""
- cset = self.get_output_charset()
- # The len(s) of a 7bit encoding is len(s)
- if self.header_encoding == BASE64:
- return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
- elif self.header_encoding == QP:
- return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
- elif self.header_encoding == SHORTEST:
- lenb64 = email.base64mime.base64_len(s)
- lenqp = email.quoprimime.header_quopri_len(s)
- return min(lenb64, lenqp) + len(cset) + MISC_LEN
- else:
- return len(s)
-
- def header_encode(self, s, convert=False):
- """Header-encode a string, optionally converting it to output_charset.
-
- If convert is True, the string will be converted from the input
- charset to the output charset automatically. This is not useful for
- multibyte character sets, which have line length issues (multibyte
- characters must be split on a character, not a byte boundary); use the
- high-level Header class to deal with these issues. convert defaults
- to False.
+ def header_encode(self, string):
+ """Header-encode a string by converting it first to bytes.
The type of encoding (base64 or quoted-printable) will be based on
- self.header_encoding.
+ this charset's `header_encoding`.
+
+ :param string: A unicode string for the header. It must be possible
+ to encode this string to bytes using the character set's
+ output codec.
+ :return: The encoded string, with RFC 2047 chrome.
"""
- cset = self.get_output_charset()
- if convert:
- s = self.convert(s)
+ codec = self.output_codec or 'us-ascii'
+ header_bytes = _encode(string, codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
+ encoder_module = self._get_encoder(header_bytes)
+ if encoder_module is None:
+ return string
+ return encoder_module.header_encode(header_bytes, codec)
+
+ def header_encode_lines(self, string, maxlengths):
+ """Header-encode a string by converting it first to bytes.
+
+ This is similar to `header_encode()` except that the string is fit
+ into maximum line lengths as given by the argument.
+
+ :param string: A unicode string for the header. It must be possible
+ to encode this string to bytes using the character set's
+ output codec.
+ :param maxlengths: Maximum line length iterator. Each element
+ returned from this iterator will provide the next maximum line
+ length. This parameter is used as an argument to built-in next()
+ and should never be exhausted. The maximum line lengths should
+ not count the RFC 2047 chrome. These line lengths are only a
+ hint; the splitter does the best it can.
+ :return: Lines of encoded strings, each with RFC 2047 chrome.
+ """
+ # See which encoding we should use.
+ codec = self.output_codec or 'us-ascii'
+ header_bytes = _encode(string, codec)
+ encoder_module = self._get_encoder(header_bytes)
+ encoder = partial(encoder_module.header_encode, charset=codec)
+ # Calculate the number of characters that the RFC 2047 chrome will
+ # contribute to each line.
+ charset = self.get_output_charset()
+ extra = len(charset) + RFC2047_CHROME_LEN
+ # Now comes the hard part. We must encode bytes but we can't split on
+ # bytes because some character sets are variable length and each
+ # encoded word must stand on its own. So the problem is you have to
+ # encode to bytes to figure out this word's length, but you must split
+ # on characters. This causes two problems: first, we don't know how
+ # many octets a specific substring of unicode characters will get
+ # encoded to, and second, we don't know how many ASCII characters
+ # those octets will get encoded to. Unless we try it. Which seems
+ # inefficient. In the interest of being correct rather than fast (and
+ # in the hope that there will be few encoded headers in any such
+ # message), brute force it. :(
+ lines = []
+ current_line = []
+ maxlen = next(maxlengths) - extra
+ for character in string:
+ current_line.append(character)
+ this_line = EMPTYSTRING.join(current_line)
+ length = encoder_module.header_length(_encode(this_line, charset))
+ if length > maxlen:
+ # This last character doesn't fit so pop it off.
+ current_line.pop()
+ # Does nothing fit on the first line?
+ if not lines and not current_line:
+ lines.append(None)
+ else:
+ separator = (' ' if lines else '')
+ joined_line = EMPTYSTRING.join(current_line)
+ header_bytes = _encode(joined_line, codec)
+ lines.append(encoder(header_bytes))
+ current_line = [character]
+ maxlen = next(maxlengths) - extra
+ joined_line = EMPTYSTRING.join(current_line)
+ header_bytes = _encode(joined_line, codec)
+ lines.append(encoder(header_bytes))
+ return lines
+
+ def _get_encoder(self, header_bytes):
if self.header_encoding == BASE64:
- return email.base64mime.header_encode(s, cset)
+ return email.base64mime
elif self.header_encoding == QP:
- return email.quoprimime.header_encode(s, cset, maxlinelen=None)
+ return email.quoprimime
elif self.header_encoding == SHORTEST:
- lenb64 = email.base64mime.base64_len(s)
- lenqp = email.quoprimime.header_quopri_len(s)
- if lenb64 < lenqp:
- return email.base64mime.header_encode(s, cset)
+ len64 = email.base64mime.header_length(header_bytes)
+ lenqp = email.quoprimime.header_length(header_bytes)
+ if len64 < lenqp:
+ return email.base64mime
else:
- return email.quoprimime.header_encode(s, cset, maxlinelen=None)
+ return email.quoprimime
else:
- return s
-
- def body_encode(self, s, convert=True):
- """Body-encode a string and convert it to output_charset.
+ return None
- If convert is True (the default), the string will be converted from
- the input charset to output charset automatically. Unlike
- header_encode(), there are no issues with byte boundaries and
- multibyte charsets in email bodies, so this is usually pretty safe.
+ def body_encode(self, string):
+ """Body-encode a string by converting it first to bytes.
The type of encoding (base64 or quoted-printable) will be based on
- self.body_encoding.
+ self.body_encoding. If body_encoding is None, we assume the
+ output charset is a 7bit encoding, so re-encoding the decoded
+ string using the ascii codec produces the correct string version
+ of the content.
"""
- if convert:
- s = self.convert(s)
# 7bit/8bit encodings return the string unchanged (module conversions)
if self.body_encoding is BASE64:
- return email.base64mime.body_encode(s)
+ if isinstance(string, str):
+ string = string.encode(self.output_charset)
+ return email.base64mime.body_encode(string)
elif self.body_encoding is QP:
- return email.quoprimime.body_encode(s)
+ return email.quoprimime.body_encode(string)
else:
- return s
+ if isinstance(string, str):
+ string = string.encode(self.output_charset).decode('ascii')
+ return string