aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Lib/email/message.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/email/message.py')
-rw-r--r--Lib/email/message.py210
1 files changed, 137 insertions, 73 deletions
diff --git a/Lib/email/message.py b/Lib/email/message.py
index 7c93370984c..f1ffcdb4de0 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2006 Python Software Foundation
+# Copyright (C) 2001-2007 Python Software Foundation
# Author: Barry Warsaw
# Contact: email-sig@python.org
@@ -8,14 +8,17 @@ __all__ = ['Message']
import re
import uu
+import base64
import binascii
import warnings
-from cStringIO import StringIO
+from io import BytesIO, StringIO
# Intrapackage imports
-import email.charset
from email import utils
from email import errors
+from email import header
+from email import charset as _charset
+Charset = _charset.Charset
SEMISPACE = '; '
@@ -23,14 +26,31 @@ SEMISPACE = '; '
# existence of which force quoting of the parameter value.
tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
+# How to figure out if we are processing strings that come from a byte
+# source with undecodable characters.
+_has_surrogates = re.compile(
+ '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+
# Helper functions
+def _sanitize_header(name, value):
+ # If the header value contains surrogates, return a Header using
+ # the unknown-8bit charset to encode the bytes as encoded words.
+ if not isinstance(value, str):
+ # Assume it is already a header object
+ return value
+ if _has_surrogates(value):
+ return header.Header(value, charset=_charset.UNKNOWN8BIT,
+ header_name=name)
+ else:
+ return value
+
def _splitparam(param):
# Split header parameters. BAW: this may be too simple. It isn't
# strictly RFC 2045 (section 5.1) compliant, but it catches most headers
- # found in the wild. We may eventually need a full fledged parser
- # eventually.
- a, sep, b = param.partition(';')
+ # found in the wild. We may eventually need a full fledged parser.
+ # RDM: we might have a Header here; for now just stringify it.
+ a, sep, b = str(param).partition(';')
if not sep:
return a.strip(), None
return a.strip(), b.strip()
@@ -40,16 +60,26 @@ def _formatparam(param, value=None, quote=True):
This will quote the value if needed or if quote is true. If value is a
three tuple (charset, language, value), it will be encoded according
- to RFC2231 rules.
+ to RFC2231 rules. If it contains non-ascii characters it will likewise
+ be encoded according to RFC2231 rules, using the utf-8 charset and
+ a null language.
"""
if value is not None and len(value) > 0:
# A tuple is used for RFC 2231 encoded parameter values where items
# are (charset, language, value). charset is a string, not a Charset
- # instance.
+ # instance. RFC 2231 encoded values are never quoted, per RFC.
if isinstance(value, tuple):
# Encode as per RFC 2231
param += '*'
value = utils.encode_rfc2231(value[2], value[0], value[1])
+ return '%s=%s' % (param, value)
+ else:
+ try:
+ value.encode('ascii')
+ except UnicodeEncodeError:
+ param += '*'
+ value = utils.encode_rfc2231(value, 'utf-8', '')
+ return '%s=%s' % (param, value)
# BAW: Please check this. I think that if quote is set it should
# force quoting even if not necessary.
if quote or tspecials.search(value):
@@ -60,6 +90,8 @@ def _formatparam(param, value=None, quote=True):
return param
def _parseparam(s):
+ # RDM This might be a Header, so for now stringify it.
+ s = ';' + str(s)
plist = []
while s[:1] == ';':
s = s[1:]
@@ -119,21 +151,20 @@ class Message:
"""Return the entire formatted message as a string.
This includes the headers, body, and envelope header.
"""
- return self.as_string(unixfrom=True)
+ return self.as_string()
- def as_string(self, unixfrom=False):
+ def as_string(self, unixfrom=False, maxheaderlen=0):
"""Return the entire formatted message as a string.
Optional `unixfrom' when True, means include the Unix From_ envelope
header.
This is a convenience method and may not generate the message exactly
- as you intend because by default it mangles lines that begin with
- "From ". For more flexibility, use the flatten() method of a
+ as you intend. For more flexibility, use the flatten() method of a
Generator instance.
"""
from email.generator import Generator
fp = StringIO()
- g = Generator(fp)
+ g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen)
g.flatten(self, unixfrom=unixfrom)
return fp.getvalue()
@@ -185,34 +216,73 @@ class Message:
If the message is a multipart and the decode flag is True, then None
is returned.
"""
- if i is None:
- payload = self._payload
- elif not isinstance(self._payload, list):
- raise TypeError('Expected list, got %s' % type(self._payload))
- else:
- payload = self._payload[i]
- if decode:
- if self.is_multipart():
+ # Here is the logic table for this code, based on the email5.0.0 code:
+ # i decode is_multipart result
+ # ------ ------ ------------ ------------------------------
+ # None True True None
+ # i True True None
+ # None False True _payload (a list)
+ # i False True _payload element i (a Message)
+ # i False False error (not a list)
+ # i True False error (not a list)
+ # None False False _payload
+ # None True False _payload decoded (bytes)
+ # Note that Barry planned to factor out the 'decode' case, but that
+ # isn't so easy now that we handle the 8 bit data, which needs to be
+ # converted in both the decode and non-decode path.
+ if self.is_multipart():
+ if decode:
return None
- cte = self.get('content-transfer-encoding', '').lower()
- if cte == 'quoted-printable':
- return utils._qdecode(payload)
- elif cte == 'base64':
- try:
- return utils._bdecode(payload)
- except binascii.Error:
- # Incorrect padding
- return payload
- elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
- sfp = StringIO()
+ if i is None:
+ return self._payload
+ else:
+ return self._payload[i]
+ # For backward compatibility, Use isinstance and this error message
+ # instead of the more logical is_multipart test.
+ if i is not None and not isinstance(self._payload, list):
+ raise TypeError('Expected list, got %s' % type(self._payload))
+ payload = self._payload
+ # cte might be a Header, so for now stringify it.
+ cte = str(self.get('content-transfer-encoding', '')).lower()
+ # payload may be bytes here.
+ if isinstance(payload, str):
+ if _has_surrogates(payload):
+ bpayload = payload.encode('ascii', 'surrogateescape')
+ if not decode:
+ try:
+ payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
+ except LookupError:
+ payload = bpayload.decode('ascii', 'replace')
+ elif decode:
try:
- uu.decode(StringIO(payload+'\n'), sfp, quiet=True)
- payload = sfp.getvalue()
- except uu.Error:
- # Some decoding problem
- return payload
- # Everything else, including encodings with 8bit or 7bit are returned
- # unchanged.
+ bpayload = payload.encode('ascii')
+ except UnicodeError:
+ # This won't happen for RFC compliant messages (messages
+ # containing only ASCII codepoints in the unicode input).
+ # If it does happen, turn the string into bytes in a way
+ # guaranteed not to fail.
+ bpayload = payload.encode('raw-unicode-escape')
+ if not decode:
+ return payload
+ if cte == 'quoted-printable':
+ return utils._qdecode(bpayload)
+ elif cte == 'base64':
+ try:
+ return base64.b64decode(bpayload)
+ except binascii.Error:
+ # Incorrect padding
+ return bpayload
+ elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
+ in_file = BytesIO(bpayload)
+ out_file = BytesIO()
+ try:
+ uu.decode(in_file, out_file, quiet=True)
+ return out_file.getvalue()
+ except uu.Error:
+ # Some decoding problem
+ return bpayload
+ if isinstance(payload, str):
+ return bpayload
return payload
def set_payload(self, payload, charset=None):
@@ -238,18 +308,13 @@ class Message:
and encoded properly, if needed, when generating the plain text
representation of the message. MIME headers (MIME-Version,
Content-Type, Content-Transfer-Encoding) will be added as needed.
-
"""
if charset is None:
self.del_param('charset')
self._charset = None
return
- if isinstance(charset, basestring):
- charset = email.charset.Charset(charset)
- if not isinstance(charset, email.charset.Charset):
- raise TypeError(charset)
- # BAW: should we accept strings that can serve as arguments to the
- # Charset constructor?
+ if not isinstance(charset, Charset):
+ charset = Charset(charset)
self._charset = charset
if 'MIME-Version' not in self:
self.add_header('MIME-Version', '1.0')
@@ -258,9 +323,7 @@ class Message:
charset=charset.get_output_charset())
else:
self.set_param('charset', charset.get_output_charset())
- if isinstance(self._payload, unicode):
- self._payload = self._payload.encode(charset.output_charset)
- if str(charset) != charset.get_output_charset():
+ if charset != charset.get_output_charset():
self._payload = charset.body_encode(self._payload)
if 'Content-Transfer-Encoding' not in self:
cte = charset.get_body_encoding()
@@ -316,10 +379,9 @@ class Message:
def __contains__(self, name):
return name.lower() in [k.lower() for k, v in self._headers]
- def has_key(self, name):
- """Return true if the message contains the header."""
- missing = object()
- return self.get(name, missing) is not missing
+ def __iter__(self):
+ for field, value in self._headers:
+ yield field
def keys(self):
"""Return a list of all the message's header field names.
@@ -339,7 +401,7 @@ class Message:
Any fields deleted and re-inserted are always appended to the header
list.
"""
- return [v for k, v in self._headers]
+ return [_sanitize_header(k, v) for k, v in self._headers]
def items(self):
"""Get all the message's header fields and values.
@@ -349,7 +411,7 @@ class Message:
Any fields deleted and re-inserted are always appended to the header
list.
"""
- return self._headers[:]
+ return [(k, _sanitize_header(k, v)) for k, v in self._headers]
def get(self, name, failobj=None):
"""Get a header value.
@@ -360,7 +422,7 @@ class Message:
name = name.lower()
for k, v in self._headers:
if k.lower() == name:
- return v
+ return _sanitize_header(k, v)
return failobj
#
@@ -380,7 +442,7 @@ class Message:
name = name.lower()
for k, v in self._headers:
if k.lower() == name:
- values.append(v)
+ values.append(_sanitize_header(k, v))
if not values:
return failobj
return values
@@ -392,13 +454,18 @@ class Message:
additional parameters for the header field, with underscores converted
to dashes. Normally the parameter will be added as key="value" unless
value is None, in which case only the key will be added. If a
- parameter value contains non-ASCII characters it must be specified as a
+ parameter value contains non-ASCII characters it can be specified as a
three-tuple of (charset, language, value), in which case it will be
- encoded according to RFC2231 rules.
+ encoded according to RFC2231 rules. Otherwise it will be encoded using
+ the utf-8 charset and a language of ''.
- Example:
+ Examples:
msg.add_header('content-disposition', 'attachment', filename='bud.gif')
+ msg.add_header('content-disposition', 'attachment',
+ filename=('utf-8', '', Fußballer.ppt'))
+ msg.add_header('content-disposition', 'attachment',
+ filename='Fußballer.ppt'))
"""
parts = []
for k, v in _params.items():
@@ -497,7 +564,7 @@ class Message:
if value is missing:
return failobj
params = []
- for p in _parseparam(';' + value):
+ for p in _parseparam(value):
try:
name, val = p.split('=', 1)
name = name.strip()
@@ -546,17 +613,15 @@ class Message:
the form (CHARSET, LANGUAGE, VALUE). Note that both CHARSET and
LANGUAGE can be None, in which case you should consider VALUE to be
encoded in the us-ascii charset. You can usually ignore LANGUAGE.
+ The parameter value (either the returned string, or the VALUE item in
+ the 3-tuple) is always unquoted, unless unquote is set to False.
- Your application should be prepared to deal with 3-tuple return
- values, and can convert the parameter to a Unicode string like so:
+ If your application doesn't care whether the parameter was RFC 2231
+ encoded, it can turn the return value into a string as follows:
param = msg.get_param('foo')
- if isinstance(param, tuple):
- param = unicode(param[2], param[0] or 'us-ascii')
+ param = email.utils.collapse_rfc2231_value(rawparam)
- In any case, the parameter value (either the returned string, or the
- VALUE item in the 3-tuple) is always unquoted, unless unquote is set
- to False.
"""
if header not in self:
return failobj
@@ -762,14 +827,13 @@ class Message:
# LookupError will be raised if the charset isn't known to
# Python. UnicodeError will be raised if the encoded text
# contains a character not in the charset.
- charset = unicode(charset[2], pcharset).encode('us-ascii')
+ as_bytes = charset[2].encode('raw-unicode-escape')
+ charset = str(as_bytes, pcharset)
except (LookupError, UnicodeError):
charset = charset[2]
- # charset character must be in us-ascii range
+ # charset characters must be in us-ascii range
try:
- if isinstance(charset, str):
- charset = unicode(charset, 'us-ascii')
- charset = charset.encode('us-ascii')
+ charset.encode('us-ascii')
except UnicodeError:
return failobj
# RFC 2046, $4.1.2 says charsets are not case sensitive