1 files changed, 137 insertions, 73 deletions
diff --git a/Lib/email/message.py b/Lib/email/message.py
index 7c93370984c..f1ffcdb4de0 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2006 Python Software Foundation
+# Copyright (C) 2001-2007 Python Software Foundation
 # Author: Barry Warsaw
 # Contact: email-sig@python.org
 
@@ -8,14 +8,17 @@ __all__ = ['Message']
 
 import re
 import uu
+import base64
 import binascii
 import warnings
-from cStringIO import StringIO
+from io import BytesIO, StringIO
 
 # Intrapackage imports
-import email.charset
 from email import utils
 from email import errors
+from email import header
+from email import charset as _charset
+Charset = _charset.Charset
 
 SEMISPACE = '; '
 
@@ -23,14 +26,31 @@ SEMISPACE = '; '
 # existence of which force quoting of the parameter value.
 tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
 
+# How to figure out if we are processing strings that come from a byte
+# source with undecodable characters.
+_has_surrogates = re.compile(
+    '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+
 
 # Helper functions
+def _sanitize_header(name, value):
+    # If the header value contains surrogates, return a Header using
+    # the unknown-8bit charset to encode the bytes as encoded words.
+    if not isinstance(value, str):
+        # Assume it is already a header object
+        return value
+    if _has_surrogates(value):
+        return header.Header(value, charset=_charset.UNKNOWN8BIT,
+                             header_name=name)
+    else:
+        return value
+
 def _splitparam(param):
     # Split header parameters.  BAW: this may be too simple.  It isn't
     # strictly RFC 2045 (section 5.1) compliant, but it catches most headers
-    # found in the wild.  We may eventually need a full fledged parser
-    # eventually.
-    a, sep, b = param.partition(';')
+    # found in the wild.  We may eventually need a full fledged parser.
+    # RDM: we might have a Header here; for now just stringify it.
+    a, sep, b = str(param).partition(';')
     if not sep:
         return a.strip(), None
     return a.strip(), b.strip()
@@ -40,16 +60,26 @@ def _formatparam(param, value=None, quote=True):
 
     This will quote the value if needed or if quote is true.  If value is a
     three tuple (charset, language, value), it will be encoded according
-    to RFC2231 rules.
+    to RFC2231 rules.  If it contains non-ascii characters it will likewise
+    be encoded according to RFC2231 rules, using the utf-8 charset and
+    a null language.
     """
     if value is not None and len(value) > 0:
         # A tuple is used for RFC 2231 encoded parameter values where items
         # are (charset, language, value).  charset is a string, not a Charset
-        # instance.
+        # instance.  RFC 2231 encoded values are never quoted, per RFC.
         if isinstance(value, tuple):
             # Encode as per RFC 2231
             param += '*'
             value = utils.encode_rfc2231(value[2], value[0], value[1])
+            return '%s=%s' % (param, value)
+        else:
+            try:
+                value.encode('ascii')
+            except UnicodeEncodeError:
+                param += '*'
+                value = utils.encode_rfc2231(value, 'utf-8', '')
+                return '%s=%s' % (param, value)
         # BAW: Please check this.  I think that if quote is set it should
         # force quoting even if not necessary.
         if quote or tspecials.search(value):
@@ -60,6 +90,8 @@ def _formatparam(param, value=None, quote=True):
         return param
 
 def _parseparam(s):
+    # RDM: this might be a Header, so for now stringify it.
+    s = ';' + str(s)
     plist = []
     while s[:1] == ';':
         s = s[1:]
@@ -119,21 +151,20 @@ class Message:
         """Return the entire formatted message as a string.
         This includes the headers, body, and envelope header.
         """
-        return self.as_string(unixfrom=True)
+        return self.as_string()
 
-    def as_string(self, unixfrom=False):
+    def as_string(self, unixfrom=False, maxheaderlen=0):
         """Return the entire formatted message as a string.
         Optional `unixfrom' when True, means include the Unix From_ envelope
         header.
 
         This is a convenience method and may not generate the message exactly
-        as you intend because by default it mangles lines that begin with
-        "From ".  For more flexibility, use the flatten() method of a
+        as you intend.  For more flexibility, use the flatten() method of a
         Generator instance.
         """
         from email.generator import Generator
         fp = StringIO()
-        g = Generator(fp)
+        g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen)
         g.flatten(self, unixfrom=unixfrom)
         return fp.getvalue()
 
@@ -185,34 +216,73 @@ class Message:
         If the message is a multipart and the decode flag is True, then None
         is returned.
         """
-        if i is None:
-            payload = self._payload
-        elif not isinstance(self._payload, list):
-            raise TypeError('Expected list, got %s' % type(self._payload))
-        else:
-            payload = self._payload[i]
-        if decode:
-            if self.is_multipart():
+        # Here is the logic table for this code, based on the email5.0.0 code:
+        #   i     decode  is_multipart  result
+        # ------  ------  ------------  ------------------------------
+        #  None   True    True          None
+        #   i     True    True          None
+        #  None   False   True          _payload (a list)
+        #   i     False   True          _payload element i (a Message)
+        #   i     False   False         error (not a list)
+        #   i     True    False         error (not a list)
+        #  None   False   False         _payload
+        #  None   True    False         _payload decoded (bytes)
+        # Note that Barry planned to factor out the 'decode' case, but that
+        # isn't so easy now that we handle the 8 bit data, which needs to be
+        # converted in both the decode and non-decode path.
+        if self.is_multipart():
+            if decode:
                 return None
-            cte = self.get('content-transfer-encoding', '').lower()
-            if cte == 'quoted-printable':
-                return utils._qdecode(payload)
-            elif cte == 'base64':
-                try:
-                    return utils._bdecode(payload)
-                except binascii.Error:
-                    # Incorrect padding
-                    return payload
-            elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
-                sfp = StringIO()
+            if i is None:
+                return self._payload
+            else:
+                return self._payload[i]
+        # For backward compatibility, use isinstance and this error message
+        # instead of the more logical is_multipart test.
+        if i is not None and not isinstance(self._payload, list):
+            raise TypeError('Expected list, got %s' % type(self._payload))
+        payload = self._payload
+        # cte might be a Header, so for now stringify it.
+        cte = str(self.get('content-transfer-encoding', '')).lower()
+        # payload may be bytes here.
+        if isinstance(payload, str):
+            if _has_surrogates(payload):
+                bpayload = payload.encode('ascii', 'surrogateescape')
+                if not decode:
+                    try:
+                        payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
+                    except LookupError:
+                        payload = bpayload.decode('ascii', 'replace')
+            elif decode:
                 try:
-                    uu.decode(StringIO(payload+'\n'), sfp, quiet=True)
-                    payload = sfp.getvalue()
-                except uu.Error:
-                    # Some decoding problem
-                    return payload
-        # Everything else, including encodings with 8bit or 7bit are returned
-        # unchanged.
+                    bpayload = payload.encode('ascii')
+                except UnicodeError:
+                    # This won't happen for RFC compliant messages (messages
+                    # containing only ASCII codepoints in the unicode input).
+                    # If it does happen, turn the string into bytes in a way
+                    # guaranteed not to fail.
+                    bpayload = payload.encode('raw-unicode-escape')
+        if not decode:
+            return payload
+        if cte == 'quoted-printable':
+            return utils._qdecode(bpayload)
+        elif cte == 'base64':
+            try:
+                return base64.b64decode(bpayload)
+            except binascii.Error:
+                # Incorrect padding
+                return bpayload
+        elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
+            in_file = BytesIO(bpayload)
+            out_file = BytesIO()
+            try:
+                uu.decode(in_file, out_file, quiet=True)
+                return out_file.getvalue()
+            except uu.Error:
+                # Some decoding problem
+                return bpayload
+        if isinstance(payload, str):
+            return bpayload
         return payload
 
     def set_payload(self, payload, charset=None):
@@ -238,18 +308,13 @@ class Message:
         and encoded properly, if needed, when generating the plain text
         representation of the message.  MIME headers (MIME-Version,
         Content-Type, Content-Transfer-Encoding) will be added as needed.
-
         """
         if charset is None:
             self.del_param('charset')
             self._charset = None
             return
-        if isinstance(charset, basestring):
-            charset = email.charset.Charset(charset)
-        if not isinstance(charset, email.charset.Charset):
-            raise TypeError(charset)
-        # BAW: should we accept strings that can serve as arguments to the
-        # Charset constructor?
+        if not isinstance(charset, Charset):
+            charset = Charset(charset)
         self._charset = charset
         if 'MIME-Version' not in self:
             self.add_header('MIME-Version', '1.0')
@@ -258,9 +323,7 @@ class Message:
                             charset=charset.get_output_charset())
         else:
             self.set_param('charset', charset.get_output_charset())
-        if isinstance(self._payload, unicode):
-            self._payload = self._payload.encode(charset.output_charset)
-        if str(charset) != charset.get_output_charset():
+        if charset != charset.get_output_charset():
             self._payload = charset.body_encode(self._payload)
         if 'Content-Transfer-Encoding' not in self:
             cte = charset.get_body_encoding()
@@ -316,10 +379,9 @@ class Message:
     def __contains__(self, name):
         return name.lower() in [k.lower() for k, v in self._headers]
 
-    def has_key(self, name):
-        """Return true if the message contains the header."""
-        missing = object()
-        return self.get(name, missing) is not missing
+    def __iter__(self):
+        for field, value in self._headers:
+            yield field
 
     def keys(self):
         """Return a list of all the message's header field names.
@@ -339,7 +401,7 @@ class Message:
         Any fields deleted and re-inserted are always appended to the header
         list.
         """
-        return [v for k, v in self._headers]
+        return [_sanitize_header(k, v) for k, v in self._headers]
 
     def items(self):
         """Get all the message's header fields and values.
@@ -349,7 +411,7 @@ class Message:
         Any fields deleted and re-inserted are always appended to the header
         list.
         """
-        return self._headers[:]
+        return [(k, _sanitize_header(k, v)) for k, v in self._headers]
 
     def get(self, name, failobj=None):
         """Get a header value.
@@ -360,7 +422,7 @@ class Message:
         name = name.lower()
         for k, v in self._headers:
             if k.lower() == name:
-                return v
+                return _sanitize_header(k, v)
         return failobj
 
     #
@@ -380,7 +442,7 @@ class Message:
         name = name.lower()
         for k, v in self._headers:
             if k.lower() == name:
-                values.append(v)
+                values.append(_sanitize_header(k, v))
         if not values:
             return failobj
         return values
@@ -392,13 +454,18 @@ class Message:
         additional parameters for the header field, with underscores converted
         to dashes.  Normally the parameter will be added as key="value" unless
         value is None, in which case only the key will be added.  If a
-        parameter value contains non-ASCII characters it must be specified as a
+        parameter value contains non-ASCII characters it can be specified as a
         three-tuple of (charset, language, value), in which case it will be
-        encoded according to RFC2231 rules.
+        encoded according to RFC2231 rules.  Otherwise it will be encoded using
+        the utf-8 charset and a language of ''.
 
-        Example:
+        Examples:
 
         msg.add_header('content-disposition', 'attachment', filename='bud.gif')
+        msg.add_header('content-disposition', 'attachment',
+                       filename=('utf-8', '', 'Fußballer.ppt'))
+        msg.add_header('content-disposition', 'attachment',
+                       filename='Fußballer.ppt')
         """
         parts = []
         for k, v in _params.items():
@@ -497,7 +564,7 @@ class Message:
         if value is missing:
             return failobj
         params = []
-        for p in _parseparam(';' + value):
+        for p in _parseparam(value):
             try:
                 name, val = p.split('=', 1)
                 name = name.strip()
@@ -546,17 +613,15 @@ class Message:
         the form (CHARSET, LANGUAGE, VALUE).  Note that both CHARSET and
         LANGUAGE can be None, in which case you should consider VALUE to be
         encoded in the us-ascii charset.  You can usually ignore LANGUAGE.
+        The parameter value (either the returned string, or the VALUE item in
+        the 3-tuple) is always unquoted, unless unquote is set to False.
 
-        Your application should be prepared to deal with 3-tuple return
-        values, and can convert the parameter to a Unicode string like so:
+        If your application doesn't care whether the parameter was RFC 2231
+        encoded, it can turn the return value into a string as follows:
 
             param = msg.get_param('foo')
-            if isinstance(param, tuple):
-                param = unicode(param[2], param[0] or 'us-ascii')
+            param = email.utils.collapse_rfc2231_value(param)
 
-        In any case, the parameter value (either the returned string, or the
-        VALUE item in the 3-tuple) is always unquoted, unless unquote is set
-        to False.
         """
         if header not in self:
             return failobj
@@ -762,14 +827,13 @@ class Message:
                 # LookupError will be raised if the charset isn't known to
                 # Python.  UnicodeError will be raised if the encoded text
                 # contains a character not in the charset.
-                charset = unicode(charset[2], pcharset).encode('us-ascii')
+                as_bytes = charset[2].encode('raw-unicode-escape')
+                charset = str(as_bytes, pcharset)
             except (LookupError, UnicodeError):
                 charset = charset[2]
-        # charset character must be in us-ascii range
+        # charset characters must be in us-ascii range
         try:
-            if isinstance(charset, str):
-                charset = unicode(charset, 'us-ascii')
-            charset = charset.encode('us-ascii')
+            charset.encode('us-ascii')
         except UnicodeError:
             return failobj
         # RFC 2046, $4.1.2 says charsets are not case sensitive