2 files changed, 180 insertions, 54 deletions
diff --git a/Lib/html/__init__.py b/Lib/html/__init__.py
index 02652ef73c3..da0a0a3ce70 100644
--- a/Lib/html/__init__.py
+++ b/Lib/html/__init__.py
@@ -2,12 +2,12 @@
 General functions for HTML manipulation.
 """
 
+import re as _re
+from html.entities import html5 as _html5
 
-_escape_map = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;'}
-_escape_map_full = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;',
-                    ord('"'): '&quot;', ord('\''): '&#x27;'}
 
-# NB: this is a candidate for a bytes/string polymorphic interface
+__all__ = ['escape', 'unescape']
+
 
 def escape(s, quote=True):
     """
@@ -16,6 +16,117 @@ def escape(s, quote=True):
     characters, both double quote (") and single quote (') characters are also
     translated.
     """
+    s = s.replace("&", "&amp;") # Must be done first!
+    s = s.replace("<", "&lt;")
+    s = s.replace(">", "&gt;")
     if quote:
-        return s.translate(_escape_map_full)
-    return s.translate(_escape_map)
+        s = s.replace('"', "&quot;")
+        s = s.replace('\'', "&#x27;")
+    return s
+
+
+# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
+
+_invalid_charrefs = {
+    0x00: '\ufffd',  # REPLACEMENT CHARACTER
+    0x0d: '\r',      # CARRIAGE RETURN
+    0x80: '\u20ac',  # EURO SIGN
+    0x81: '\x81',    # <control>
+    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
+    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
+    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
+    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
+    0x86: '\u2020',  # DAGGER
+    0x87: '\u2021',  # DOUBLE DAGGER
+    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
+    0x89: '\u2030',  # PER MILLE SIGN
+    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
+    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
+    0x8d: '\x8d',    # <control>
+    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
+    0x8f: '\x8f',    # <control>
+    0x90: '\x90',    # <control>
+    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
+    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
+    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
+    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
+    0x95: '\u2022',  # BULLET
+    0x96: '\u2013',  # EN DASH
+    0x97: '\u2014',  # EM DASH
+    0x98: '\u02dc',  # SMALL TILDE
+    0x99: '\u2122',  # TRADE MARK SIGN
+    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
+    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
+    0x9d: '\x9d',    # <control>
+    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
+    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
+}
+
+_invalid_codepoints = {
+    # 0x0001 to 0x0008
+    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+    # 0x000E to 0x001F
+    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+    # 0x007F to 0x009F
+    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
+    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
+    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+    # 0xFDD0 to 0xFDEF
+    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
+    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
+    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
+    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
+    # others
+    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
+    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
+    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
+    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
+    0x10fffe, 0x10ffff
+}
+
+
+def _replace_charref(s):
+    s = s.group(1)
+    if s[0] == '#':
+        # numeric charref
+        if s[1] in 'xX':
+            num = int(s[2:].rstrip(';'), 16)
+        else:
+            num = int(s[1:].rstrip(';'))
+        if num in _invalid_charrefs:
+            return _invalid_charrefs[num]
+        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
+            return '\uFFFD'
+        if num in _invalid_codepoints:
+            return ''
+        return chr(num)
+    else:
+        # named charref
+        if s in _html5:
+            return _html5[s]
+        # find the longest matching name (as defined by the standard)
+        for x in range(len(s)-1, 1, -1):
+            if s[:x] in _html5:
+                return _html5[s[:x]] + s[x:]
+        else:
+            return '&' + s
+
+
+_charref = _re.compile(r'&(#[0-9]+;?'
+                       r'|#[xX][0-9a-fA-F]+;?'
+                       r'|[^\t\n\f <&#;]{1,32};?)')
+
+def unescape(s):
+    """
+    Convert all named and numeric character references (e.g. &gt;, &#62;,
+    &#x3e;) in the string s to the corresponding Unicode characters.
+    This function uses the rules defined by the HTML 5 standard
+    for both valid and invalid character references, and the list of
+    HTML 5 named character references defined in html.entities.html5.
+    """
+    if '&' not in s:
+        return s
+    return _charref.sub(_replace_charref, s)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 63fe77425bd..a650d5eeded 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -8,9 +8,14 @@
 # and CDATA (character data -- only end tags are special).
 
 
-import _markupbase
 import re
 import warnings
+import _markupbase
+
+from html import unescape
+
+
+__all__ = ['HTMLParser']
 
 # Regular expressions used for parsing
 
@@ -92,6 +97,8 @@ class HTMLParseError(Exception):
         return result
 
 
+_default_sentinel = object()
+
 class HTMLParser(_markupbase.ParserBase):
     """Find tags and other markup and call handler functions.
 
@@ -105,26 +112,39 @@ class HTMLParser(_markupbase.ParserBase):
     self.handle_startendtag(); end tags by self.handle_endtag().  The
     data between tags is passed from the parser to the derived class
     by calling self.handle_data() with the data as argument (the data
-    may be split up in arbitrary chunks).  Entity references are
-    passed by calling self.handle_entityref() with the entity
-    reference as the argument.  Numeric character references are
-    passed to self.handle_charref() with the string containing the
-    reference as the argument.
+    may be split up in arbitrary chunks).  If convert_charrefs is
+    True, character references are converted automatically to the
+    corresponding Unicode characters (and the text passed to
+    self.handle_data() is no longer split in chunks); otherwise they
+    are passed by calling self.handle_entityref() or
+    self.handle_charref() with the string containing, respectively,
+    the named or numeric reference as the argument.
     """
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-    def __init__(self, strict=False):
+    def __init__(self, strict=_default_sentinel, *,
+                 convert_charrefs=_default_sentinel):
         """Initialize and reset this instance.
 
+        If convert_charrefs is True (default: False), all character references
+        are automatically converted to the corresponding Unicode characters.
         If strict is set to False (the default) the parser will parse invalid
         markup, otherwise it will raise an error.  Note that the strict mode
-        is deprecated.
+        and argument are deprecated.
         """
-        if strict:
-            warnings.warn("The strict mode is deprecated.",
+        if strict is not _default_sentinel:
+            warnings.warn("The strict argument and mode are deprecated.",
                           DeprecationWarning, stacklevel=2)
+        else:
+            strict = False  # default
         self.strict = strict
+        if convert_charrefs is _default_sentinel:
+            convert_charrefs = False  # default
+            warnings.warn("The value of convert_charrefs will become True in "
+                          "3.5. You are encouraged to set the value explicitly.",
+                          DeprecationWarning, stacklevel=2)
+        self.convert_charrefs = convert_charrefs
         self.reset()
 
     def reset(self):
@@ -149,6 +169,8 @@ class HTMLParser(_markupbase.ParserBase):
         self.goahead(1)
 
     def error(self, message):
+        warnings.warn("The 'error' method is deprecated.",
+                      DeprecationWarning, stacklevel=2)
         raise HTMLParseError(message, self.getpos())
 
     __starttag_text = None
@@ -173,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase):
         i = 0
         n = len(rawdata)
         while i < n:
-            match = self.interesting.search(rawdata, i) # < or &
-            if match:
-                j = match.start()
+            if self.convert_charrefs and not self.cdata_elem:
+                j = rawdata.find('<', i)
+                if j < 0:
+                    if not end:
+                        break  # wait till we get all the text
+                    j = n
             else:
-                if self.cdata_elem:
-                    break
-                j = n
-            if i < j: self.handle_data(rawdata[i:j])
+                match = self.interesting.search(rawdata, i)  # < or &
+                if match:
+                    j = match.start()
+                else:
+                    if self.cdata_elem:
+                        break
+                    j = n
+            if i < j:
+                if self.convert_charrefs and not self.cdata_elem:
+                    self.handle_data(unescape(rawdata[i:j]))
+                else:
+                    self.handle_data(rawdata[i:j])
             i = self.updatepos(i, j)
             if i == n: break
             startswith = rawdata.startswith
@@ -215,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase):
                             k = i + 1
                     else:
                         k += 1
-                    self.handle_data(rawdata[i:k])
+                    if self.convert_charrefs and not self.cdata_elem:
+                        self.handle_data(unescape(rawdata[i:k]))
+                    else:
+                        self.handle_data(rawdata[i:k])
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
@@ -266,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase):
                 assert 0, "interesting.search() lied"
         # end while
         if end and i < n and not self.cdata_elem:
-            self.handle_data(rawdata[i:n])
+            if self.convert_charrefs and not self.cdata_elem:
+                self.handle_data(unescape(rawdata[i:n]))
+            else:
+                self.handle_data(rawdata[i:n])
             i = self.updatepos(i, n)
         self.rawdata = rawdata[i:]
 
@@ -349,7 +388,7 @@ class HTMLParser(_markupbase.ParserBase):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = self.unescape(attrvalue)
+                attrvalue = unescape(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 
@@ -505,31 +544,7 @@ class HTMLParser(_markupbase.ParserBase):
 
     # Internal -- helper to remove special character quoting
     def unescape(self, s):
-        if '&' not in s:
-            return s
-        def replaceEntities(s):
-            s = s.groups()[0]
-            try:
-                if s[0] == "#":
-                    s = s[1:]
-                    if s[0] in ['x','X']:
-                        c = int(s[1:].rstrip(';'), 16)
-                    else:
-                        c = int(s.rstrip(';'))
-                    return chr(c)
-            except ValueError:
-                return '&#' + s
-            else:
-                from html.entities import html5
-                if s in html5:
-                    return html5[s]
-                elif s.endswith(';'):
-                    return '&' + s
-                for x in range(2, len(s)):
-                    if s[:x] in html5:
-                        return html5[s[:x]] + s[x:]
-                else:
-                    return '&' + s
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
-                      replaceEntities, s, flags=re.ASCII)
+        warnings.warn('The unescape method is deprecated and will be removed '
+                      'in 3.5, use html.unescape() instead.',
+                      DeprecationWarning, stacklevel=2)
+        return unescape(s)