aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Lib/html/parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/html/parser.py')
-rw-r--r--Lib/html/parser.py67
1 files changed, 52 insertions, 15 deletions
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 13c95c34e50..ba416e7fa6e 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -12,6 +12,7 @@ import re
import _markupbase
from html import unescape
+from html.entities import html5 as html5_entities
__all__ = ['HTMLParser']
@@ -23,8 +24,10 @@ incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
@@ -57,6 +60,22 @@ endendtag = re.compile('>')
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+ ref = match.group(0)
+ # Numeric / hex char refs must always be unescaped
+ if ref.startswith('&#'):
+ return unescape(ref)
+ # Named character / entity references must only be unescaped
+ # if they are an exact match, and they are not followed by an equals sign
+ if not ref.endswith('=') and ref[1:] in html5_entities:
+ return unescape(ref)
+ # Otherwise do not unescape
+ return ref
+
+def _unescape_attrvalue(s):
+ return attr_charref.sub(_replace_attr_charref, s)
class HTMLParser(_markupbase.ParserBase):
@@ -177,7 +196,7 @@ class HTMLParser(_markupbase.ParserBase):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
- elif (i + 1) < n:
+ elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
@@ -185,17 +204,35 @@ class HTMLParser(_markupbase.ParserBase):
if k < 0:
if not end:
break
- k = rawdata.find('>', i + 1)
- if k < 0:
- k = rawdata.find('<', i + 1)
- if k < 0:
- k = i + 1
- else:
- k += 1
- if self.convert_charrefs and not self.cdata_elem:
- self.handle_data(unescape(rawdata[i:k]))
+ if starttagopen.match(rawdata, i): # < + letter
+ pass
+ elif startswith("</", i):
+ if i + 2 == n:
+ self.handle_data("</")
+ elif endtagopen.match(rawdata, i): # </ + letter
+ pass
+ else:
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<!--", i):
+ j = n
+ for suffix in ("--!", "--", "-"):
+ if rawdata.endswith(suffix, i+4):
+ j -= len(suffix)
+ break
+ self.handle_comment(rawdata[i+4:j])
+ elif startswith("<![CDATA[", i):
+ self.unknown_decl(rawdata[i+3:])
+ elif rawdata[i:i+9].lower() == '<!doctype':
+ self.handle_decl(rawdata[i+2:])
+ elif startswith("<!", i):
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<?", i):
+ self.handle_pi(rawdata[i+2:])
else:
- self.handle_data(rawdata[i:k])
+ raise AssertionError("we should not get here!")
+ k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
@@ -242,7 +279,7 @@ class HTMLParser(_markupbase.ParserBase):
else:
assert 0, "interesting.search() lied"
# end while
- if end and i < n and not self.cdata_elem:
+ if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:n]))
else:
@@ -260,7 +297,7 @@ class HTMLParser(_markupbase.ParserBase):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
- elif rawdata[i:i+3] == '<![':
+ elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
@@ -277,7 +314,7 @@ class HTMLParser(_markupbase.ParserBase):
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
- 'parse_comment()')
+ 'parse_bogus_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1
@@ -323,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
- attrvalue = unescape(attrvalue)
+ attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()