1 files changed, 52 insertions, 15 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 13c95c34e50..ba416e7fa6e 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -12,6 +12,7 @@ import re
 import _markupbase
 
 from html import unescape
+from html.entities import html5 as html5_entities
 
 
 __all__ = ['HTMLParser']
@@ -23,8 +24,10 @@ incomplete = re.compile('&[a-zA-Z#]')
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
 
 starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 # Note:
@@ -57,6 +60,22 @@ endendtag = re.compile('>')
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+    ref = match.group(0)
+    # Numeric / hex char refs must always be unescaped
+    if ref.startswith('&#'):
+        return unescape(ref)
+    # Named character / entity references must only be unescaped
+    # if they are an exact match, and they are not followed by an equals sign
+    if not ref.endswith('=') and ref[1:] in html5_entities:
+        return unescape(ref)
+    # Otherwise do not unescape
+    return ref
+
+def _unescape_attrvalue(s):
+    return attr_charref.sub(_replace_attr_charref, s)
 
 
 class HTMLParser(_markupbase.ParserBase):
@@ -177,7 +196,7 @@ class HTMLParser(_markupbase.ParserBase):
                     k = self.parse_pi(i)
                 elif startswith("<!", i):
                     k = self.parse_html_declaration(i)
-                elif (i + 1) < n:
+                elif (i + 1) < n or end:
                     self.handle_data("<")
                     k = i + 1
                 else:
@@ -185,17 +204,35 @@ class HTMLParser(_markupbase.ParserBase):
                 if k < 0:
                     if not end:
                         break
-                    k = rawdata.find('>', i + 1)
-                    if k < 0:
-                        k = rawdata.find('<', i + 1)
-                        if k < 0:
-                            k = i + 1
-                    else:
-                        k += 1
-                    if self.convert_charrefs and not self.cdata_elem:
-                        self.handle_data(unescape(rawdata[i:k]))
+                    if starttagopen.match(rawdata, i):  # < + letter
+                        pass
+                    elif startswith("</", i):
+                        if i + 2 == n:
+                            self.handle_data("</")
+                        elif endtagopen.match(rawdata, i):  # </ + letter
+                            pass
+                        else:
+                            # bogus comment
+                            self.handle_comment(rawdata[i+2:])
+                    elif startswith("<!--", i):
+                        j = n
+                        for suffix in ("--!", "--", "-"):
+                            if rawdata.endswith(suffix, i+4):
+                                j -= len(suffix)
+                                break
+                        self.handle_comment(rawdata[i+4:j])
+                    elif startswith("<![CDATA[", i):
+                        self.unknown_decl(rawdata[i+3:])
+                    elif rawdata[i:i+9].lower() == '<!doctype':
+                        self.handle_decl(rawdata[i+2:])
+                    elif startswith("<!", i):
+                        # bogus comment
+                        self.handle_comment(rawdata[i+2:])
+                    elif startswith("<?", i):
+                        self.handle_pi(rawdata[i+2:])
                     else:
-                        self.handle_data(rawdata[i:k])
+                        raise AssertionError("we should not get here!")
+                    k = n
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
@@ -242,7 +279,7 @@ class HTMLParser(_markupbase.ParserBase):
             else:
                 assert 0, "interesting.search() lied"
         # end while
-        if end and i < n and not self.cdata_elem:
+        if end and i < n:
             if self.convert_charrefs and not self.cdata_elem:
                 self.handle_data(unescape(rawdata[i:n]))
             else:
@@ -260,7 +297,7 @@ class HTMLParser(_markupbase.ParserBase):
         if rawdata[i:i+4] == '<!--':
             # this case is actually already handled in goahead()
             return self.parse_comment(i)
-        elif rawdata[i:i+3] == '<![':
+        elif rawdata[i:i+9] == '<![CDATA[':
             return self.parse_marked_section(i)
         elif rawdata[i:i+9].lower() == '<!doctype':
             # find the closing >
@@ -277,7 +314,7 @@ class HTMLParser(_markupbase.ParserBase):
     def parse_bogus_comment(self, i, report=1):
         rawdata = self.rawdata
         assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
-                                                'parse_comment()')
+                                                'parse_bogus_comment()')
         pos = rawdata.find('>', i+2)
         if pos == -1:
             return -1
@@ -323,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = unescape(attrvalue)
+                attrvalue = _unescape_attrvalue(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()