1 file changed, 35 insertions, 53 deletions
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index c976021e0e0..ac4da3705f3 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -26,8 +26,9 @@ import time
 import base64
 import random
 import socket
-import urllib
+import urllib.parse
 import warnings
+from io import StringIO
 
 from email._parseaddr import quote
 from email._parseaddr import AddressList as _AddressList
@@ -44,7 +45,7 @@ from email.encoders import _bencode, _qencode
 
 COMMASPACE = ', '
 EMPTYSTRING = ''
-UEMPTYSTRING = u''
+UEMPTYSTRING = ''
 CRLF = '\r\n'
 TICK = "'"
 
@@ -52,36 +53,9 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[][\\()"]')
 
 
-
-# Helpers
-
-def _identity(s):
-    return s
-
-
-def _bdecode(s):
-    """Decodes a base64 string.
-
-    This function is equivalent to base64.decodestring and it's retained only
-    for backward compatibility. It used to remove the last \\n of the decoded
-    string, if it had any (see issue 7143).
-    """
-    if not s:
-        return s
-    return base64.decodestring(s)
-
-
-
-def fix_eols(s):
-    """Replace all line-ending characters with \\r\\n."""
-    # Fix newlines with no preceding carriage return
-    s = re.sub(r'(?<!\r)\n', CRLF, s)
-    # Fix carriage returns with no following newline
-    s = re.sub(r'\r(?!\n)', CRLF, s)
-    return s
 
+# Helpers
 
-
 def formataddr(pair):
     """The inverse of parseaddr(), this takes a 2-tuple of the form
     (realname, email_address) and returns the string value suitable
@@ -100,7 +74,7 @@ def formataddr(pair):
     return address
 
 
-
+
 def getaddresses(fieldvalues):
     """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
     all = COMMASPACE.join(fieldvalues)
@@ -108,7 +82,7 @@ def getaddresses(fieldvalues):
     return a.addresslist
 
 
-
+
 ecre = re.compile(r'''
   =\?                   # literal =?
   (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
@@ -120,7 +94,7 @@ ecre = re.compile(r'''
   ''', re.VERBOSE | re.IGNORECASE)
 
 
-
+
 def formatdate(timeval=None, localtime=False, usegmt=False):
     """Returns a date string as specified by RFC 2822, e.g.:
 
@@ -173,14 +147,16 @@ def formatdate(timeval=None, localtime=False, usegmt=False):
         zone)
 
 
-
-def make_msgid(idstring=None):
+
+def make_msgid(idstring=None, domain=None):
     """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
 
     <20020201195627.33539.96671@nightshade.la.mastaler.com>
 
     Optional idstring if given is a string used to strengthen the
-    uniqueness of the message id.
+    uniqueness of the message id.  Optional domain if given provides the
+    portion of the message id after the '@'.  It defaults to the locally
+    defined hostname.
     """
     timeval = time.time()
     utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
@@ -190,12 +166,13 @@ def make_msgid(idstring=None):
         idstring = ''
     else:
         idstring = '.' + idstring
-    idhost = socket.getfqdn()
-    msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
+    if domain is None:
+        domain = socket.getfqdn()
+    msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain)
     return msgid
 
 
-
+
 # These functions are in the standalone mimelib version only because they've
 # subsequently been fixed in the latest Python versions.  We use this to worm
 # around broken older Pythons.
@@ -229,7 +206,7 @@ def unquote(str):
     return str
 
 
-
+
 # RFC2231-related functions - parameter encoding and decoding
 def decode_rfc2231(s):
     """Decode string according to RFC 2231"""
@@ -246,8 +223,7 @@ def encode_rfc2231(s, charset=None, language=None):
     charset is given but not language, the string is encoded using the empty
     string for language.
     """
-    import urllib
-    s = urllib.quote(s, safe='')
+    s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
     if charset is None and language is None:
         return s
     if language is None:
@@ -255,7 +231,8 @@ def encode_rfc2231(s, charset=None, language=None):
     return "%s'%s'%s" % (charset, language, s)
 
 
-rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
+rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
+    re.ASCII)
 
 def decode_params(params):
     """Decode parameters list according to RFC 2231.
@@ -299,7 +276,10 @@ def decode_params(params):
             # language specifiers at the beginning of the string.
             for num, s, encoded in continuations:
                 if encoded:
-                    s = urllib.unquote(s)
+                    # Decode as "latin-1", so the characters in s directly
+                    # represent the percent-encoded octet values.
+                    # collapse_rfc2231_value treats this as an octet sequence.
+                    s = urllib.parse.unquote(s, encoding="latin-1")
                     extended = True
                 value.append(s)
             value = quote(EMPTYSTRING.join(value))
@@ -312,13 +292,27 @@ def decode_params(params):
 
 def collapse_rfc2231_value(value, errors='replace',
                            fallback_charset='us-ascii'):
-    if isinstance(value, tuple):
-        rawval = unquote(value[2])
-        charset = value[0] or 'us-ascii'
-        try:
-            return unicode(rawval, charset, errors)
-        except LookupError:
-            # XXX charset is unknown to Python.
-            return unicode(rawval, fallback_charset, errors)
-    else:
+    """Collapse an RFC 2231 parameter value into a plain string.
+
+    A value that is not a (charset, language, text) 3-tuple is returned
+    unquoted.  For a 3-tuple, text is decoded using charset with the
+    given errors handler; fallback_charset is used when the tuple has no
+    charset.  If the charset is not a known codec, the unquoted text is
+    returned instead.
+    """
+    if not isinstance(value, tuple) or len(value) != 3:
         return unquote(value)
+    charset, language, text = value
+    if not charset:
+        # No charset was supplied, so decode with the caller's fallback
+        # instead of failing inside str().
+        charset = fallback_charset
+    # text is a str holding one character per octet.  Convert it back to
+    # those exact octets; encoding it as UTF-8 would instead expand every
+    # non-ASCII character into several bytes.
+    rawbytes = bytes(text, 'raw-unicode-escape')
+    try:
+        return str(rawbytes, charset, errors)
+    except LookupError:
+        # charset is not a known codec.
+        return unquote(text)