-
-
Notifications
You must be signed in to change notification settings - Fork 32.5k
gh-135661: Fix parsing start and end tags in HTMLParser #135930
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
7171803
182b16f
436a8a9
ebf8ce3
d05303b
955db4e
66ec1a0
f38ad41
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -36,29 +36,33 @@ | |||||
# explode, so don't do it. | ||||||
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state | ||||||
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state | ||||||
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') | ||||||
attrfind_tolerant = re.compile( | ||||||
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' | ||||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') | ||||||
locatestarttagend_tolerant = re.compile(r""" | ||||||
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name | ||||||
(?:[\s/]* # optional whitespace before attribute name | ||||||
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name | ||||||
(?:\s*=+\s* # value indicator | ||||||
(?:'[^']*' # LITA-enclosed value | ||||||
|"[^"]*" # LIT-enclosed value | ||||||
|(?!['"])[^>\s]* # bare value | ||||||
) | ||||||
\s* # possibly followed by a space | ||||||
)?(?:\s|/(?!>))* | ||||||
)* | ||||||
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*') | ||||||
attrfind_tolerant = re.compile(r""" | ||||||
( | ||||||
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name | ||||||
) | ||||||
(= # value indicator | ||||||
('[^']*' # LITA-enclosed value | ||||||
|"[^"]*" # LIT-enclosed value | ||||||
|(?!['"])[^>\t\n\r\f ]* # bare value | ||||||
) | ||||||
)? | ||||||
\s* # trailing whitespace | ||||||
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space | ||||||
""", re.VERBOSE) | ||||||
locatetagend_tolerant = re.compile(r""" | ||||||
[a-zA-Z][^\t\n\r\f />]* # tag name | ||||||
[\t\n\r\f /]* # optional whitespace before attribute name | ||||||
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name | ||||||
(?:= # value indicator | ||||||
(?:'[^']*' # LITA-enclosed value | ||||||
|"[^"]*" # LIT-enclosed value | ||||||
|(?!['"])[^>\t\n\r\f ]* # bare value | ||||||
) | ||||||
)? | ||||||
[\t\n\r\f /]* # possibly followed by a space | ||||||
)* | ||||||
>? | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These changes make sense to me. I also noticed that you removed the This variable is not documented however I can see two options:
Note that before there was also a set of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Restored the removed variables. I will remove them in the main branch in the following PR. |
||||||
""", re.VERBOSE) | ||||||
endendtag = re.compile('>') | ||||||
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between | ||||||
# </ and the tag name, so maybe this should be fixed | ||||||
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') | ||||||
|
||||||
# Character reference processing logic specific to attribute values | ||||||
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state | ||||||
|
@@ -141,7 +145,8 @@ def get_starttag_text(self): | |||||
|
||||||
def set_cdata_mode(self, elem): | ||||||
self.cdata_elem = elem.lower() | ||||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | ||||||
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem, | ||||||
re.IGNORECASE|re.ASCII) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any reason for adding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it affects case-insensitive mode. Otherwise 'ſ' ~ 's' and 'ı' ~ 'i'. There may be more cases after adding support for This is not actually a problem in the current code, but future changes could make this important. |
||||||
|
||||||
def clear_cdata_mode(self): | ||||||
self.interesting = interesting_normal | ||||||
|
@@ -166,7 +171,7 @@ def goahead(self, end): | |||||
# & near the end and see if it's followed by a space or ;. | ||||||
amppos = rawdata.rfind('&', max(i, n-34)) | ||||||
if (amppos >= 0 and | ||||||
not re.compile(r'[\s;]').search(rawdata, amppos)): | ||||||
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)): | ||||||
break # wait till we get all the text | ||||||
j = n | ||||||
else: | ||||||
|
@@ -381,76 +386,39 @@ def parse_starttag(self, i): | |||||
# or -1 if incomplete. | ||||||
def check_for_whole_start_tag(self, i): | ||||||
rawdata = self.rawdata | ||||||
m = locatestarttagend_tolerant.match(rawdata, i) | ||||||
if m: | ||||||
j = m.end() | ||||||
next = rawdata[j:j+1] | ||||||
if next == ">": | ||||||
return j + 1 | ||||||
if next == "/": | ||||||
if rawdata.startswith("/>", j): | ||||||
return j + 2 | ||||||
if rawdata.startswith("/", j): | ||||||
# buffer boundary | ||||||
return -1 | ||||||
# else bogus input | ||||||
if j > i: | ||||||
return j | ||||||
else: | ||||||
return i + 1 | ||||||
if next == "": | ||||||
# end of input | ||||||
return -1 | ||||||
if next in ("abcdefghijklmnopqrstuvwxyz=/" | ||||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"): | ||||||
# end of input in or before attribute value, or we have the | ||||||
# '/' from a '/>' ending | ||||||
return -1 | ||||||
if j > i: | ||||||
return j | ||||||
else: | ||||||
return i + 1 | ||||||
raise AssertionError("we should not get here!") | ||||||
match = locatetagend_tolerant.match(rawdata, i+1) | ||||||
assert match | ||||||
j = match.end() | ||||||
if rawdata[j-1] != ">": | ||||||
return -1 | ||||||
return j | ||||||
|
||||||
# Internal -- parse endtag, return end or -1 if incomplete | ||||||
def parse_endtag(self, i): | ||||||
rawdata = self.rawdata | ||||||
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" | ||||||
match = endendtag.search(rawdata, i+1) # > | ||||||
if not match: | ||||||
if rawdata.find('>', i+2) < 0: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Probably inconsequential performance-wise, but using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check is not actually needed. It is simply an optimization for the case of truncated end tag, because it is faster than
|
||||||
return -1 | ||||||
gtpos = match.end() | ||||||
match = endtagfind.match(rawdata, i) # </ + tag + > | ||||||
if not match: | ||||||
if self.cdata_elem is not None: | ||||||
self.handle_data(rawdata[i:gtpos]) | ||||||
return gtpos | ||||||
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state | ||||||
namematch = tagfind_tolerant.match(rawdata, i+2) | ||||||
if not namematch: | ||||||
# w3.org/TR/html5/tokenization.html#end-tag-open-state | ||||||
if rawdata[i:i+3] == '</>': | ||||||
return i+3 | ||||||
else: | ||||||
return self.parse_bogus_comment(i) | ||||||
tagname = namematch.group(1).lower() | ||||||
# consume and ignore other stuff between the name and the > | ||||||
# Note: this is not 100% correct, since we might have things like | ||||||
# </tag attr=">">, but looking for > after the name should cover | ||||||
# most of the cases and is much simpler | ||||||
gtpos = rawdata.find('>', namematch.end()) | ||||||
self.handle_endtag(tagname) | ||||||
return gtpos+1 | ||||||
if not endtagopen.match(rawdata, i): # </ + letter | ||||||
# w3.org/TR/html5/tokenization.html#end-tag-open-state | ||||||
if rawdata[i+2:i+3] == '>': # </> is ignored | ||||||
return i+3 | ||||||
else: | ||||||
return self.parse_bogus_comment(i) | ||||||
|
||||||
elem = match.group(1).lower() # script or style | ||||||
if self.cdata_elem is not None: | ||||||
if elem != self.cdata_elem: | ||||||
self.handle_data(rawdata[i:gtpos]) | ||||||
return gtpos | ||||||
match = locatetagend_tolerant.match(rawdata, i+2) | ||||||
assert match | ||||||
j = match.end() | ||||||
if rawdata[j-1] != ">": | ||||||
return -1 | ||||||
|
||||||
self.handle_endtag(elem) | ||||||
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state | ||||||
match = tagfind_tolerant.match(rawdata, i+2) | ||||||
assert match | ||||||
tag = match.group(1).lower() | ||||||
self.handle_endtag(tag) | ||||||
self.clear_cdata_mode() | ||||||
return gtpos | ||||||
return j | ||||||
|
||||||
# Overridable -- finish processing of start+end tag: <tag.../> | ||||||
def handle_startendtag(self, tag, attrs): | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know if you saw and heeded the warning or if you just got lucky, but it looks like you were able to change these regex!
Since you renamed
locatestarttagend
, the comment at line 34 should also be updated.In addition, make sure that existing comments are still relevant. In particular I would appreciate this for comments linking to specific sections of the HTML5 standard.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are links below, they still work, although they now redirect to other address. I updated them.
On other hand, section numbers were changed. I updated them in places which I touched.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!