diff options
Diffstat (limited to 'Lib/email/message.py')
-rw-r--r-- | Lib/email/message.py | 210 |
1 files changed, 137 insertions, 73 deletions
diff --git a/Lib/email/message.py b/Lib/email/message.py index 7c93370984c..f1ffcdb4de0 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -1,4 +1,4 @@ -# Copyright (C) 2001-2006 Python Software Foundation +# Copyright (C) 2001-2007 Python Software Foundation # Author: Barry Warsaw # Contact: email-sig@python.org @@ -8,14 +8,17 @@ __all__ = ['Message'] import re import uu +import base64 import binascii import warnings -from cStringIO import StringIO +from io import BytesIO, StringIO # Intrapackage imports -import email.charset from email import utils from email import errors +from email import header +from email import charset as _charset +Charset = _charset.Charset SEMISPACE = '; ' @@ -23,14 +26,31 @@ SEMISPACE = '; ' # existence of which force quoting of the parameter value. tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') +# How to figure out if we are processing strings that come from a byte +# source with undecodable characters. +_has_surrogates = re.compile( + '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search + # Helper functions +def _sanitize_header(name, value): + # If the header value contains surrogates, return a Header using + # the unknown-8bit charset to encode the bytes as encoded words. + if not isinstance(value, str): + # Assume it is already a header object + return value + if _has_surrogates(value): + return header.Header(value, charset=_charset.UNKNOWN8BIT, + header_name=name) + else: + return value + def _splitparam(param): # Split header parameters. BAW: this may be too simple. It isn't # strictly RFC 2045 (section 5.1) compliant, but it catches most headers - # found in the wild. We may eventually need a full fledged parser - # eventually. - a, sep, b = param.partition(';') + # found in the wild. We may eventually need a full fledged parser. + # RDM: we might have a Header here; for now just stringify it. 
+ a, sep, b = str(param).partition(';') if not sep: return a.strip(), None return a.strip(), b.strip() @@ -40,16 +60,26 @@ def _formatparam(param, value=None, quote=True): This will quote the value if needed or if quote is true. If value is a three tuple (charset, language, value), it will be encoded according - to RFC2231 rules. + to RFC2231 rules. If it contains non-ascii characters it will likewise + be encoded according to RFC2231 rules, using the utf-8 charset and + a null language. """ if value is not None and len(value) > 0: # A tuple is used for RFC 2231 encoded parameter values where items # are (charset, language, value). charset is a string, not a Charset - # instance. + # instance. RFC 2231 encoded values are never quoted, per RFC. if isinstance(value, tuple): # Encode as per RFC 2231 param += '*' value = utils.encode_rfc2231(value[2], value[0], value[1]) + return '%s=%s' % (param, value) + else: + try: + value.encode('ascii') + except UnicodeEncodeError: + param += '*' + value = utils.encode_rfc2231(value, 'utf-8', '') + return '%s=%s' % (param, value) # BAW: Please check this. I think that if quote is set it should # force quoting even if not necessary. if quote or tspecials.search(value): @@ -60,6 +90,8 @@ def _formatparam(param, value=None, quote=True): return param def _parseparam(s): + # RDM This might be a Header, so for now stringify it. + s = ';' + str(s) plist = [] while s[:1] == ';': s = s[1:] @@ -119,21 +151,20 @@ class Message: """Return the entire formatted message as a string. This includes the headers, body, and envelope header. """ - return self.as_string(unixfrom=True) + return self.as_string() - def as_string(self, unixfrom=False): + def as_string(self, unixfrom=False, maxheaderlen=0): """Return the entire formatted message as a string. Optional `unixfrom' when True, means include the Unix From_ envelope header. 
This is a convenience method and may not generate the message exactly - as you intend because by default it mangles lines that begin with - "From ". For more flexibility, use the flatten() method of a + as you intend. For more flexibility, use the flatten() method of a Generator instance. """ from email.generator import Generator fp = StringIO() - g = Generator(fp) + g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen) g.flatten(self, unixfrom=unixfrom) return fp.getvalue() @@ -185,34 +216,73 @@ class Message: If the message is a multipart and the decode flag is True, then None is returned. """ - if i is None: - payload = self._payload - elif not isinstance(self._payload, list): - raise TypeError('Expected list, got %s' % type(self._payload)) - else: - payload = self._payload[i] - if decode: - if self.is_multipart(): + # Here is the logic table for this code, based on the email5.0.0 code: + # i decode is_multipart result + # ------ ------ ------------ ------------------------------ + # None True True None + # i True True None + # None False True _payload (a list) + # i False True _payload element i (a Message) + # i False False error (not a list) + # i True False error (not a list) + # None False False _payload + # None True False _payload decoded (bytes) + # Note that Barry planned to factor out the 'decode' case, but that + # isn't so easy now that we handle the 8 bit data, which needs to be + # converted in both the decode and non-decode path. 
+ if self.is_multipart(): + if decode: return None - cte = self.get('content-transfer-encoding', '').lower() - if cte == 'quoted-printable': - return utils._qdecode(payload) - elif cte == 'base64': - try: - return utils._bdecode(payload) - except binascii.Error: - # Incorrect padding - return payload - elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): - sfp = StringIO() + if i is None: + return self._payload + else: + return self._payload[i] + # For backward compatibility, Use isinstance and this error message + # instead of the more logical is_multipart test. + if i is not None and not isinstance(self._payload, list): + raise TypeError('Expected list, got %s' % type(self._payload)) + payload = self._payload + # cte might be a Header, so for now stringify it. + cte = str(self.get('content-transfer-encoding', '')).lower() + # payload may be bytes here. + if isinstance(payload, str): + if _has_surrogates(payload): + bpayload = payload.encode('ascii', 'surrogateescape') + if not decode: + try: + payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace') + except LookupError: + payload = bpayload.decode('ascii', 'replace') + elif decode: try: - uu.decode(StringIO(payload+'\n'), sfp, quiet=True) - payload = sfp.getvalue() - except uu.Error: - # Some decoding problem - return payload - # Everything else, including encodings with 8bit or 7bit are returned - # unchanged. + bpayload = payload.encode('ascii') + except UnicodeError: + # This won't happen for RFC compliant messages (messages + # containing only ASCII codepoints in the unicode input). + # If it does happen, turn the string into bytes in a way + # guaranteed not to fail. 
+ bpayload = payload.encode('raw-unicode-escape') + if not decode: + return payload + if cte == 'quoted-printable': + return utils._qdecode(bpayload) + elif cte == 'base64': + try: + return base64.b64decode(bpayload) + except binascii.Error: + # Incorrect padding + return bpayload + elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): + in_file = BytesIO(bpayload) + out_file = BytesIO() + try: + uu.decode(in_file, out_file, quiet=True) + return out_file.getvalue() + except uu.Error: + # Some decoding problem + return bpayload + if isinstance(payload, str): + return bpayload return payload def set_payload(self, payload, charset=None): @@ -238,18 +308,13 @@ class Message: and encoded properly, if needed, when generating the plain text representation of the message. MIME headers (MIME-Version, Content-Type, Content-Transfer-Encoding) will be added as needed. - """ if charset is None: self.del_param('charset') self._charset = None return - if isinstance(charset, basestring): - charset = email.charset.Charset(charset) - if not isinstance(charset, email.charset.Charset): - raise TypeError(charset) - # BAW: should we accept strings that can serve as arguments to the - # Charset constructor? 
+ if not isinstance(charset, Charset): + charset = Charset(charset) self._charset = charset if 'MIME-Version' not in self: self.add_header('MIME-Version', '1.0') @@ -258,9 +323,7 @@ class Message: charset=charset.get_output_charset()) else: self.set_param('charset', charset.get_output_charset()) - if isinstance(self._payload, unicode): - self._payload = self._payload.encode(charset.output_charset) - if str(charset) != charset.get_output_charset(): + if charset != charset.get_output_charset(): self._payload = charset.body_encode(self._payload) if 'Content-Transfer-Encoding' not in self: cte = charset.get_body_encoding() @@ -316,10 +379,9 @@ class Message: def __contains__(self, name): return name.lower() in [k.lower() for k, v in self._headers] - def has_key(self, name): - """Return true if the message contains the header.""" - missing = object() - return self.get(name, missing) is not missing + def __iter__(self): + for field, value in self._headers: + yield field def keys(self): """Return a list of all the message's header field names. @@ -339,7 +401,7 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return [v for k, v in self._headers] + return [_sanitize_header(k, v) for k, v in self._headers] def items(self): """Get all the message's header fields and values. @@ -349,7 +411,7 @@ class Message: Any fields deleted and re-inserted are always appended to the header list. """ - return self._headers[:] + return [(k, _sanitize_header(k, v)) for k, v in self._headers] def get(self, name, failobj=None): """Get a header value. 
@@ -360,7 +422,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - return v + return _sanitize_header(k, v) return failobj # @@ -380,7 +442,7 @@ class Message: name = name.lower() for k, v in self._headers: if k.lower() == name: - values.append(v) + values.append(_sanitize_header(k, v)) if not values: return failobj return values @@ -392,13 +454,18 @@ class Message: additional parameters for the header field, with underscores converted to dashes. Normally the parameter will be added as key="value" unless value is None, in which case only the key will be added. If a - parameter value contains non-ASCII characters it must be specified as a + parameter value contains non-ASCII characters it can be specified as a three-tuple of (charset, language, value), in which case it will be - encoded according to RFC2231 rules. + encoded according to RFC2231 rules. Otherwise it will be encoded using + the utf-8 charset and a language of ''. - Example: + Examples: msg.add_header('content-disposition', 'attachment', filename='bud.gif') + msg.add_header('content-disposition', 'attachment', + filename=('utf-8', '', 'Fußballer.ppt')) + msg.add_header('content-disposition', 'attachment', + filename='Fußballer.ppt') """ parts = [] for k, v in _params.items(): @@ -497,7 +564,7 @@ class Message: if value is missing: return failobj params = [] - for p in _parseparam(';' + value): + for p in _parseparam(value): try: name, val = p.split('=', 1) name = name.strip() @@ -546,17 +613,15 @@ class Message: the form (CHARSET, LANGUAGE, VALUE). Note that both CHARSET and LANGUAGE can be None, in which case you should consider VALUE to be encoded in the us-ascii charset. You can usually ignore LANGUAGE. + The parameter value (either the returned string, or the VALUE item in + the 3-tuple) is always unquoted, unless unquote is set to False.
- Your application should be prepared to deal with 3-tuple return - values, and can convert the parameter to a Unicode string like so: + If your application doesn't care whether the parameter was RFC 2231 + encoded, it can turn the return value into a string as follows: param = msg.get_param('foo') - if isinstance(param, tuple): - param = unicode(param[2], param[0] or 'us-ascii') + param = email.utils.collapse_rfc2231_value(param) - In any case, the parameter value (either the returned string, or the - VALUE item in the 3-tuple) is always unquoted, unless unquote is set - to False. """ if header not in self: return failobj @@ -762,14 +827,13 @@ class Message: # LookupError will be raised if the charset isn't known to # Python. UnicodeError will be raised if the encoded text # contains a character not in the charset. - charset = unicode(charset[2], pcharset).encode('us-ascii') + as_bytes = charset[2].encode('raw-unicode-escape') + charset = str(as_bytes, pcharset) except (LookupError, UnicodeError): charset = charset[2] - # charset character must be in us-ascii range + # charset characters must be in us-ascii range try: - if isinstance(charset, str): - charset = unicode(charset, 'us-ascii') - charset = charset.encode('us-ascii') + charset.encode('us-ascii') except UnicodeError: return failobj # RFC 2046, $4.1.2 says charsets are not case sensitive |