diff options
Diffstat (limited to 'Lib/html/parser.py')
-rw-r--r-- | Lib/html/parser.py | 67 |
1 files changed, 52 insertions, 15 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 13c95c34e50..ba416e7fa6e 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -12,6 +12,7 @@ import re import _markupbase from html import unescape +from html.entities import html5 as html5_entities __all__ = ['HTMLParser'] @@ -23,8 +24,10 @@ incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') +attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') +endtagopen = re.compile('</[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: @@ -57,6 +60,22 @@ endendtag = re.compile('>') # </ and the tag name, so maybe this should be fixed endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') +# Character reference processing logic specific to attribute values +# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state +def _replace_attr_charref(match): + ref = match.group(0) + # Numeric / hex char refs must always be unescaped + if ref.startswith('&#'): + return unescape(ref) + # Named character / entity references must only be unescaped + # if they are an exact match, and they are not followed by an equals sign + if not ref.endswith('=') and ref[1:] in html5_entities: + return unescape(ref) + # Otherwise do not unescape + return ref + +def _unescape_attrvalue(s): + return attr_charref.sub(_replace_attr_charref, s) class HTMLParser(_markupbase.ParserBase): @@ -177,7 +196,7 @@ class HTMLParser(_markupbase.ParserBase): k = self.parse_pi(i) elif startswith("<!", i): k = self.parse_html_declaration(i) - elif (i + 1) < n: + elif (i + 1) < n or end: self.handle_data("<") k = i + 1 else: @@ -185,17 +204,35 @@ class HTMLParser(_markupbase.ParserBase): if k < 0: if not end: break - k = rawdata.find('>', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 - else: - k += 1 - if self.convert_charrefs and not self.cdata_elem: - self.handle_data(unescape(rawdata[i:k])) + if starttagopen.match(rawdata, i): # < + letter + pass + elif startswith("</", i): + if i + 2 == n: + self.handle_data("</") + elif endtagopen.match(rawdata, i): # </ + letter + pass + else: + # bogus comment + self.handle_comment(rawdata[i+2:]) + elif startswith("<!--", i): + j = n + for suffix in ("--!", "--", "-"): + if rawdata.endswith(suffix, i+4): + j -= len(suffix) + break + self.handle_comment(rawdata[i+4:j]) + elif startswith("<![CDATA[", i): + self.unknown_decl(rawdata[i+3:]) + elif rawdata[i:i+9].lower() == '<!doctype': + self.handle_decl(rawdata[i+2:]) + elif startswith("<!", i): + # bogus comment + self.handle_comment(rawdata[i+2:]) + elif startswith("<?", i): + self.handle_pi(rawdata[i+2:]) else: - self.handle_data(rawdata[i:k]) + raise AssertionError("we should not get here!") + k = n i = self.updatepos(i, k) elif startswith("&#", i): match = charref.match(rawdata, i) @@ -242,7 +279,7 @@ class HTMLParser(_markupbase.ParserBase): else: assert 0, "interesting.search() lied" # end while - if end and i < n and not self.cdata_elem: + if end and i < n: if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:n])) else: @@ -260,7 +297,7 @@ class HTMLParser(_markupbase.ParserBase): if rawdata[i:i+4] == '<!--': # this case is actually already handled in goahead() return self.parse_comment(i) - elif rawdata[i:i+3] == '<![': + elif rawdata[i:i+9] == '<![CDATA[': return self.parse_marked_section(i) elif rawdata[i:i+9].lower() == '<!doctype': # find the closing > @@ -277,7 +314,7 @@ class HTMLParser(_markupbase.ParserBase): def parse_bogus_comment(self, i, report=1): rawdata = self.rawdata assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to ' - 'parse_comment()') + 'parse_bogus_comment()') pos = rawdata.find('>', i+2) if pos == -1: return -1 @@ -323,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase): attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: - attrvalue = unescape(attrvalue) + attrvalue = _unescape_attrvalue(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() |