Skip to content

Commit 151e0f0

Browse files
miss-islington, serhiy-storchaka, ezio-melotti, waylan
authored
[3.10] gh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (GH-135930) (GH-136268) (#136292)
* Whitespaces no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespaces no longer recognized as whitespaces. The only whitespaces are `\t\n\r\f `.
* Null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in quoted attribute value. E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespaces between the last attribute and closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`.
* Multiple `=` between attribute name and value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespaces between the `=` separator and attribute name or value are no longer ignored. E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix data loss after unclosed script or style tag (gh-86155).

Also backport test.support.subTests() (gh-135120).

---------

(cherry picked from commit 0243f97)
(cherry picked from commit c555f88)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
Co-authored-by: Waylan Limberg <waylan.limberg@icloud.com>
1 parent 85766db commit 151e0f0

File tree

5 files changed

+222
-120
lines changed

5 files changed

+222
-120
lines changed

Lib/html/parser.py

Lines changed: 70 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,43 @@
3030
commentclose = re.compile(r'--!?>')
3131
commentabruptclose = re.compile(r'-?>')
3232
# Note:
33-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
34-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
33+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
34+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3535
# explode, so don't do it.
36-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
37-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
38-
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
39-
attrfind_tolerant = re.compile(
40-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
41-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
36+
# see the HTML5 specs section "13.2.5.6 Tag open state",
37+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
38+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
39+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
40+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
41+
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
42+
attrfind_tolerant = re.compile(r"""
43+
(
44+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
45+
)
46+
(= # value indicator
47+
('[^']*' # LITA-enclosed value
48+
|"[^"]*" # LIT-enclosed value
49+
|(?!['"])[^>\t\n\r\f ]* # bare value
50+
)
51+
)?
52+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
53+
""", re.VERBOSE)
54+
locatetagend = re.compile(r"""
55+
[a-zA-Z][^\t\n\r\f />]* # tag name
56+
[\t\n\r\f /]* # optional whitespace before attribute name
57+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
58+
(?:= # value indicator
59+
(?:'[^']*' # LITA-enclosed value
60+
|"[^"]*" # LIT-enclosed value
61+
|(?!['"])[^>\t\n\r\f ]* # bare value
62+
)
63+
)?
64+
[\t\n\r\f /]* # possibly followed by a space
65+
)*
66+
>?
67+
""", re.VERBOSE)
68+
# The following variables are not used, but are temporarily left for
69+
# backward compatibility.
4270
locatestarttagend_tolerant = re.compile(r"""
4371
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4472
(?:[\s/]* # optional whitespace before attribute name
@@ -55,8 +83,6 @@
5583
\s* # trailing whitespace
5684
""", re.VERBOSE)
5785
endendtag = re.compile('>')
58-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
59-
# </ and the tag name, so maybe this should be fixed
6086
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6187

6288

@@ -123,7 +149,8 @@ def get_starttag_text(self):
123149

124150
def set_cdata_mode(self, elem):
125151
self.cdata_elem = elem.lower()
126-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
152+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
153+
re.IGNORECASE|re.ASCII)
127154

128155
def clear_cdata_mode(self):
129156
self.interesting = interesting_normal
@@ -148,7 +175,7 @@ def goahead(self, end):
148175
# & near the end and see if it's followed by a space or ;.
149176
amppos = rawdata.rfind('&', max(i, n-34))
150177
if (amppos >= 0 and
151-
not re.compile(r'[\s;]').search(rawdata, amppos)):
178+
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
152179
break # wait till we get all the text
153180
j = n
154181
else:
@@ -261,7 +288,7 @@ def goahead(self, end):
261288
else:
262289
assert 0, "interesting.search() lied"
263290
# end while
264-
if end and i < n and not self.cdata_elem:
291+
if end and i < n:
265292
if self.convert_charrefs and not self.cdata_elem:
266293
self.handle_data(unescape(rawdata[i:n]))
267294
else:
@@ -307,7 +334,7 @@ def parse_comment(self, i, report=True):
307334
return match.end()
308335

309336
# Internal -- parse bogus comment, return length or -1 if not terminated
310-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
337+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
311338
def parse_bogus_comment(self, i, report=1):
312339
rawdata = self.rawdata
313340
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -333,6 +360,8 @@ def parse_pi(self, i):
333360

334361
# Internal -- handle starttag, return end or -1 if not terminated
335362
def parse_starttag(self, i):
363+
# See the HTML5 specs section "13.2.5.8 Tag name state"
364+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
336365
self.__starttag_text = None
337366
endpos = self.check_for_whole_start_tag(i)
338367
if endpos < 0:
@@ -385,76 +414,42 @@ def parse_starttag(self, i):
385414
# or -1 if incomplete.
386415
def check_for_whole_start_tag(self, i):
387416
rawdata = self.rawdata
388-
m = locatestarttagend_tolerant.match(rawdata, i)
389-
if m:
390-
j = m.end()
391-
next = rawdata[j:j+1]
392-
if next == ">":
393-
return j + 1
394-
if next == "/":
395-
if rawdata.startswith("/>", j):
396-
return j + 2
397-
if rawdata.startswith("/", j):
398-
# buffer boundary
399-
return -1
400-
# else bogus input
401-
if j > i:
402-
return j
403-
else:
404-
return i + 1
405-
if next == "":
406-
# end of input
407-
return -1
408-
if next in ("abcdefghijklmnopqrstuvwxyz=/"
409-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
410-
# end of input in or before attribute value, or we have the
411-
# '/' from a '/>' ending
412-
return -1
413-
if j > i:
414-
return j
415-
else:
416-
return i + 1
417-
raise AssertionError("we should not get here!")
417+
match = locatetagend.match(rawdata, i+1)
418+
assert match
419+
j = match.end()
420+
if rawdata[j-1] != ">":
421+
return -1
422+
return j
418423

419424
# Internal -- parse endtag, return end or -1 if incomplete
420425
def parse_endtag(self, i):
426+
# See the HTML5 specs section "13.2.5.7 End tag open state"
427+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
421428
rawdata = self.rawdata
422429
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
423-
match = endendtag.search(rawdata, i+1) # >
424-
if not match:
430+
if rawdata.find('>', i+2) < 0: # fast check
425431
return -1
426-
gtpos = match.end()
427-
match = endtagfind.match(rawdata, i) # </ + tag + >
428-
if not match:
429-
if self.cdata_elem is not None:
430-
self.handle_data(rawdata[i:gtpos])
431-
return gtpos
432-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
433-
namematch = tagfind_tolerant.match(rawdata, i+2)
434-
if not namematch:
435-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
436-
if rawdata[i:i+3] == '</>':
437-
return i+3
438-
else:
439-
return self.parse_bogus_comment(i)
440-
tagname = namematch.group(1).lower()
441-
# consume and ignore other stuff between the name and the >
442-
# Note: this is not 100% correct, since we might have things like
443-
# </tag attr=">">, but looking for > after the name should cover
444-
# most of the cases and is much simpler
445-
gtpos = rawdata.find('>', namematch.end())
446-
self.handle_endtag(tagname)
447-
return gtpos+1
432+
if not endtagopen.match(rawdata, i): # </ + letter
433+
if rawdata[i+2:i+3] == '>': # </> is ignored
434+
# "missing-end-tag-name" parser error
435+
return i+3
436+
else:
437+
return self.parse_bogus_comment(i)
448438

449-
elem = match.group(1).lower() # script or style
450-
if self.cdata_elem is not None:
451-
if elem != self.cdata_elem:
452-
self.handle_data(rawdata[i:gtpos])
453-
return gtpos
439+
match = locatetagend.match(rawdata, i+2)
440+
assert match
441+
j = match.end()
442+
if rawdata[j-1] != ">":
443+
return -1
454444

455-
self.handle_endtag(elem)
445+
# find the name: "13.2.5.8 Tag name state"
446+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
447+
match = tagfind_tolerant.match(rawdata, i+2)
448+
assert match
449+
tag = match.group(1).lower()
450+
self.handle_endtag(tag)
456451
self.clear_cdata_mode()
457-
return gtpos
452+
return j
458453

459454
# Overridable -- finish processing of start+end tag: <tag.../>
460455
def handle_startendtag(self, tag, attrs):

Lib/test/support/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,31 @@ def check_sizeof(test, o, size):
718718
% (type(o), result, size)
719719
test.assertEqual(result, size, msg)
720720

721+
def subTests(arg_names, arg_values, /, *, _do_cleanups=False):
722+
"""Run multiple subtests with different parameters.
723+
"""
724+
single_param = False
725+
if isinstance(arg_names, str):
726+
arg_names = arg_names.replace(',',' ').split()
727+
if len(arg_names) == 1:
728+
single_param = True
729+
arg_values = tuple(arg_values)
730+
def decorator(func):
731+
if isinstance(func, type):
732+
raise TypeError('subTests() can only decorate methods, not classes')
733+
@functools.wraps(func)
734+
def wrapper(self, /, *args, **kwargs):
735+
for values in arg_values:
736+
if single_param:
737+
values = (values,)
738+
subtest_kwargs = dict(zip(arg_names, values))
739+
with self.subTest(**subtest_kwargs):
740+
func(self, *args, **kwargs, **subtest_kwargs)
741+
if _do_cleanups:
742+
self.doCleanups()
743+
return wrapper
744+
return decorator
745+
721746
#=======================================================================
722747
# Decorator for running a function in a different locale, correctly resetting
723748
# it afterwards.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy