diff options
Diffstat (limited to 'Lib/email/generator.py')
-rw-r--r-- | Lib/email/generator.py | 263 |
1 files changed, 180 insertions, 83 deletions
diff --git a/Lib/email/generator.py b/Lib/email/generator.py index 5626ab91eb5..c6bfb704f53 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -1,9 +1,10 @@ # Copyright (C) 2001-2010 Python Software Foundation +# Author: Barry Warsaw # Contact: email-sig@python.org """Classes to generate plain text from a message object tree.""" -__all__ = ['Generator', 'DecodedGenerator'] +__all__ = ['Generator', 'DecodedGenerator', 'BytesGenerator'] import re import sys @@ -11,22 +12,15 @@ import time import random import warnings -from cStringIO import StringIO +from io import StringIO, BytesIO from email.header import Header +from email.message import _has_surrogates UNDERSCORE = '_' -NL = '\n' +NL = '\n' # XXX: no longer used by the code below. fcre = re.compile(r'^From ', re.MULTILINE) -def _is8bitstring(s): - if isinstance(s, str): - try: - unicode(s, 'us-ascii') - except UnicodeError: - return True - return False - class Generator: @@ -64,8 +58,8 @@ class Generator: # Just delegate to the file object self._fp.write(s) - def flatten(self, msg, unixfrom=False): - """Print the message object tree rooted at msg to the output file + def flatten(self, msg, unixfrom=False, linesep='\n'): + r"""Print the message object tree rooted at msg to the output file specified when the Generator instance was created. unixfrom is a flag that forces the printing of a Unix From_ delimiter @@ -74,12 +68,26 @@ class Generator: is False to inhibit the printing of any From_ delimiter. Note that for subobjects, no From_ line is printed. + + linesep specifies the characters used to indicate a new line in + the output. The default value is the most useful for typical + Python applications, but it can be set to \r\n to produce RFC-compliant + line separators when needed. + """ + # We use the _XXX constants for operating on data that comes directly + # from the msg, and _encoded_XXX constants for operating on data that + # has already been converted (to bytes in the BytesGenerator) and + # inserted into a temporary buffer. + self._NL = linesep + self._encoded_NL = self._encode(linesep) + self._EMPTY = '' + self._encoded_EMTPY = self._encode('') if unixfrom: ufrom = msg.get_unixfrom() if not ufrom: ufrom = 'From nobody ' + time.ctime(time.time()) - print >> self._fp, ufrom + self.write(ufrom + self._NL) self._write(msg) def clone(self, fp): @@ -90,6 +98,27 @@ class Generator: # Protected interface - undocumented ;/ # + # Note that we use 'self.write' when what we are writing is coming from + # the source, and self._fp.write when what we are writing is coming from a + # buffer (because the Bytes subclass has already had a chance to transform + # the data in its write method in that case). This is an entirely + # pragmatic split determined by experiment; we could be more general by + # always using write and having the Bytes subclass write method detect when + # it has already transformed the input; but, since this whole thing is a + # hack anyway this seems good enough. + + # Similarly, we have _XXX and _encoded_XXX attributes that are used on + # source and buffer data, respectively. + _encoded_EMPTY = '' + + def _new_buffer(self): + # BytesGenerator overrides this to return BytesIO. + return StringIO() + + def _encode(self, s): + # BytesGenerator overrides this to encode strings to bytes. + return s + def _write(self, msg): # We can't write the headers yet because of the following scenario: # say a multipart message includes the boundary string somewhere in @@ -98,13 +127,13 @@ class Generator: # parameter. # # The way we do this, so as to make the _handle_*() methods simpler, - # is to cache any subpart writes into a StringIO. The we write the - # headers and the StringIO contents. That way, subpart handlers can + # is to cache any subpart writes into a buffer. The we write the + # headers and the buffer contents. That way, subpart handlers can # Do The Right Thing, and can still modify the Content-Type: header if # necessary. oldfp = self._fp try: - self._fp = sfp = StringIO() + self._fp = sfp = self._new_buffer() self._dispatch(msg) finally: self._fp = oldfp @@ -139,31 +168,17 @@ class Generator: def _write_headers(self, msg): for h, v in msg.items(): - print >> self._fp, '%s:' % h, - if self._maxheaderlen == 0: - # Explicit no-wrapping - print >> self._fp, v - elif isinstance(v, Header): - # Header instances know what to do - print >> self._fp, v.encode() - elif _is8bitstring(v): - # If we have raw 8bit data in a byte string, we have no idea - # what the encoding is. There is no safe way to split this - # string. If it's ascii-subset, then we could do a normal - # ascii split, but if it's multibyte then we could break the - # string. There's no way to know so the least harm seems to - # be to not split the string and risk it being too long. - print >> self._fp, v + self.write('%s: ' % h) + if isinstance(v, Header): + self.write(v.encode( + maxlinelen=self._maxheaderlen, linesep=self._NL)+self._NL) else: - # Header's got lots of smarts, so use it. Note that this is - # fundamentally broken though because we lose idempotency when - # the header string is continued with tabs. It will now be - # continued with spaces. This was reversedly broken before we - # fixed bug 1974. Either way, we lose. - print >> self._fp, Header( - v, maxlinelen=self._maxheaderlen, header_name=h).encode() + # Header's got lots of smarts, so use it. + header = Header(v, maxlinelen=self._maxheaderlen, + header_name=h) + self.write(header.encode(linesep=self._NL)+self._NL) # A blank line always separates headers from body - print >> self._fp + self.write(self._NL) # # Handlers for writing types and subtypes @@ -173,11 +188,17 @@ class Generator: payload = msg.get_payload() if payload is None: return - if not isinstance(payload, basestring): + if not isinstance(payload, str): raise TypeError('string payload expected: %s' % type(payload)) + if _has_surrogates(msg._payload): + charset = msg.get_param('charset') + if charset is not None: + del msg['content-transfer-encoding'] + msg.set_payload(payload, charset) + payload = msg.get_payload() if self._mangle_from_: payload = fcre.sub('>From ', payload) - self._fp.write(payload) + self.write(payload) # Default body handler _writeBody = _handle_text @@ -190,25 +211,25 @@ class Generator: subparts = msg.get_payload() if subparts is None: subparts = [] - elif isinstance(subparts, basestring): + elif isinstance(subparts, str): # e.g. a non-strict parse of a message with no starting boundary. - self._fp.write(subparts) + self.write(subparts) return elif not isinstance(subparts, list): # Scalar payload subparts = [subparts] for part in subparts: - s = StringIO() + s = self._new_buffer() g = self.clone(s) - g.flatten(part, unixfrom=False) + g.flatten(part, unixfrom=False, linesep=self._NL) msgtexts.append(s.getvalue()) # BAW: What about boundaries that are wrapped in double-quotes? boundary = msg.get_boundary() if not boundary: # Create a boundary that doesn't appear in any of the # message texts. - alltext = NL.join(msgtexts) - boundary = _make_boundary(alltext) + alltext = self._encoded_NL.join(msgtexts) + boundary = self._make_boundary(alltext) msg.set_boundary(boundary) # If there's a preamble, write it out, with a trailing CRLF if msg.preamble is not None: @@ -216,9 +237,9 @@ class Generator: preamble = fcre.sub('>From ', msg.preamble) else: preamble = msg.preamble - print >> self._fp, preamble + self.write(preamble + self._NL) # dash-boundary transport-padding CRLF - print >> self._fp, '--' + boundary + self.write('--' + boundary + self._NL) # body-part if msgtexts: self._fp.write(msgtexts.pop(0)) @@ -227,18 +248,18 @@ class Generator: # --> CRLF body-part for body_part in msgtexts: # delimiter transport-padding CRLF - print >> self._fp, '\n--' + boundary + self.write(self._NL + '--' + boundary + self._NL) # body-part self._fp.write(body_part) # close-delimiter transport-padding - self._fp.write('\n--' + boundary + '--') + self.write(self._NL + '--' + boundary + '--') if msg.epilogue is not None: - print >> self._fp + self.write(self._NL) if self._mangle_from_: epilogue = fcre.sub('>From ', msg.epilogue) else: epilogue = msg.epilogue - self._fp.write(epilogue) + self.write(epilogue) def _handle_multipart_signed(self, msg): # The contents of signed parts has to stay unmodified in order to keep @@ -257,23 +278,23 @@ class Generator: # block and the boundary. Sigh. blocks = [] for part in msg.get_payload(): - s = StringIO() + s = self._new_buffer() g = self.clone(s) - g.flatten(part, unixfrom=False) + g.flatten(part, unixfrom=False, linesep=self._NL) text = s.getvalue() - lines = text.split('\n') + lines = text.split(self._encoded_NL) # Strip off the unnecessary trailing empty line - if lines and lines[-1] == '': - blocks.append(NL.join(lines[:-1])) + if lines and lines[-1] == self._encoded_EMPTY: + blocks.append(self._encoded_NL.join(lines[:-1])) else: blocks.append(text) # Now join all the blocks with an empty line. This has the lovely # effect of separating each block with an empty line, but not adding # an extra one after the last one. - self._fp.write(NL.join(blocks)) + self._fp.write(self._encoded_NL.join(blocks)) def _handle_message(self, msg): - s = StringIO() + s = self._new_buffer() g = self.clone(s) # The payload of a message/rfc822 part should be a multipart sequence # of length 1. The zeroth element of the list should be the Message @@ -286,10 +307,100 @@ class Generator: # in that case we just emit the string body. payload = msg.get_payload() if isinstance(payload, list): - g.flatten(msg.get_payload(0), unixfrom=False) + g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL) payload = s.getvalue() self._fp.write(payload) + # This used to be a module level function; we use a classmethod for this + # and _compile_re so we can continue to provide the module level function + # for backward compatibility by doing + # _make_boudary = Generator._make_boundary + # at the end of the module. It *is* internal, so we could drop that... + @classmethod + def _make_boundary(cls, text=None): + # Craft a random boundary. If text is given, ensure that the chosen + # boundary doesn't appear in the text. + token = random.randrange(sys.maxsize) + boundary = ('=' * 15) + (_fmt % token) + '==' + if text is None: + return boundary + b = boundary + counter = 0 + while True: + cre = cls._compile_re('^--' + re.escape(b) + '(--)?$', re.MULTILINE) + if not cre.search(text): + break + b = boundary + '.' + str(counter) + counter += 1 + return b + + @classmethod + def _compile_re(cls, s, flags): + return re.compile(s, flags) + + +class BytesGenerator(Generator): + """Generates a bytes version of a Message object tree. + + Functionally identical to the base Generator except that the output is + bytes and not string. When surrogates were used in the input to encode + bytes, these are decoded back to bytes for output. + + The outfp object must accept bytes in its write method. + """ + + # Bytes versions of this constant for use in manipulating data from + # the BytesIO buffer. + _encoded_EMPTY = b'' + + def write(self, s): + self._fp.write(s.encode('ascii', 'surrogateescape')) + + def _new_buffer(self): + return BytesIO() + + def _encode(self, s): + return s.encode('ascii') + + def _write_headers(self, msg): + # This is almost the same as the string version, except for handling + # strings with 8bit bytes. + for h, v in msg._headers: + self.write('%s: ' % h) + if isinstance(v, Header): + self.write(v.encode(maxlinelen=self._maxheaderlen)+self._NL) + elif _has_surrogates(v): + # If we have raw 8bit data in a byte string, we have no idea + # what the encoding is. There is no safe way to split this + # string. If it's ascii-subset, then we could do a normal + # ascii split, but if it's multibyte then we could break the + # string. There's no way to know so the least harm seems to + # be to not split the string and risk it being too long. + self.write(v+NL) + else: + # Header's got lots of smarts and this string is safe... + header = Header(v, maxlinelen=self._maxheaderlen, + header_name=h) + self.write(header.encode(linesep=self._NL)+self._NL) + # A blank line always separates headers from body + self.write(self._NL) + + def _handle_text(self, msg): + # If the string has surrogates the original source was bytes, so + # just write it back out. + if msg._payload is None: + return + if _has_surrogates(msg._payload): + if self._mangle_from_: + msg._payload = fcre.sub(">From ", msg._payload) + self.write(msg._payload) + else: + super(BytesGenerator,self)._handle_text(msg) + + @classmethod + def _compile_re(cls, s, flags): + return re.compile(s.encode('ascii'), flags) + _FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]' @@ -332,12 +443,12 @@ class DecodedGenerator(Generator): for part in msg.walk(): maintype = part.get_content_maintype() if maintype == 'text': - print >> self, part.get_payload(decode=True) + print(part.get_payload(decode=False), file=self) elif maintype == 'multipart': # Just skip this pass else: - print >> self, self._fmt % { + print(self._fmt % { 'type' : part.get_content_type(), 'maintype' : part.get_content_maintype(), 'subtype' : part.get_content_subtype(), @@ -346,27 +457,13 @@ class DecodedGenerator(Generator): '[no description]'), 'encoding' : part.get('Content-Transfer-Encoding', '[no encoding]'), - } + }, file=self) -# Helper -_width = len(repr(sys.maxint-1)) +# Helper used by Generator._make_boundary +_width = len(repr(sys.maxsize-1)) _fmt = '%%0%dd' % _width -def _make_boundary(text=None): - # Craft a random boundary. If text is given, ensure that the chosen - # boundary doesn't appear in the text. - token = random.randrange(sys.maxint) - boundary = ('=' * 15) + (_fmt % token) + '==' - if text is None: - return boundary - b = boundary - counter = 0 - while True: - cre = re.compile('^--' + re.escape(b) + '(--)?$', re.MULTILINE) - if not cre.search(text): - break - b = boundary + '.' + str(counter) - counter += 1 - return b +# Backward compatibility +_make_boundary = Generator._make_boundary |