Skip to content

Commit aa0c3d1

Browse files
miss-islington, ezio-melotti, serhiy-storchaka
authored
[3.13] gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295) (GH-133834)
(cherry picked from commit 76c0b01)
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 3de6546 commit aa0c3d1

File tree

3 files changed

+68
-19
lines changed

3 files changed

+68
-19
lines changed

Lib/html/parser.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -278,7 +278,7 @@ def parse_html_declaration(self, i):
278278
if rawdata[i:i+4] == '<!--':
279279
# this case is actually already handled in goahead()
280280
return self.parse_comment(i)
281-
elif rawdata[i:i+3] == '<![':
281+
elif rawdata[i:i+9] == '<![CDATA[':
282282
return self.parse_marked_section(i)
283283
elif rawdata[i:i+9].lower() == '<!doctype':
284284
# find the closing >
@@ -295,7 +295,7 @@ def parse_html_declaration(self, i):
295295
def parse_bogus_comment(self, i, report=1):
296296
rawdata = self.rawdata
297297
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
298-
'parse_comment()')
298+
'parse_bogus_comment()')
299299
pos = rawdata.find('>', i+2)
300300
if pos == -1:
301301
return -1

Lib/test/test_htmlparser.py

Lines changed: 64 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -566,52 +566,99 @@ def test_EOF_in_charref(self):
566566
for html, expected in data:
567567
self._run_check(html, expected)
568568

569-
def test_broken_comments(self):
569+
def test_EOF_in_comments_or_decls(self):
570+
data = [
571+
('<!', [('data', '<!')]),
572+
('<!-', [('data', '<!-')]),
573+
('<!--', [('data', '<!--')]),
574+
('<![', [('data', '<![')]),
575+
('<![CDATA[', [('data', '<![CDATA[')]),
576+
('<![CDATA[x', [('data', '<![CDATA[x')]),
577+
('<!DOCTYPE', [('data', '<!DOCTYPE')]),
578+
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
579+
]
580+
for html, expected in data:
581+
self._run_check(html, expected)
582+
def test_bogus_comments(self):
570583
html = ('<! not really a comment >'
571584
'<! not a comment either -->'
572585
'<! -- close enough -->'
573586
'<!><!<-- this was an empty comment>'
574-
'<!!! another bogus comment !!!>')
587+
'<!!! another bogus comment !!!>'
588+
# see #32876
589+
'<![with square brackets]!>'
590+
'<![\nmultiline\nbogusness\n]!>'
591+
'<![more brackets]-[and a hyphen]!>'
592+
'<![cdata[should be uppercase]]>'
593+
'<![CDATA [whitespaces are not ignored]]>'
594+
'<![CDATA]]>' # required '[' after CDATA
595+
)
575596
expected = [
576597
('comment', ' not really a comment '),
577598
('comment', ' not a comment either --'),
578599
('comment', ' -- close enough --'),
579600
('comment', ''),
580601
('comment', '<-- this was an empty comment'),
581602
('comment', '!! another bogus comment !!!'),
603+
('comment', '[with square brackets]!'),
604+
('comment', '[\nmultiline\nbogusness\n]!'),
605+
('comment', '[more brackets]-[and a hyphen]!'),
606+
('comment', '[cdata[should be uppercase]]'),
607+
('comment', '[CDATA [whitespaces are not ignored]]'),
608+
('comment', '[CDATA]]'),
582609
]
583610
self._run_check(html, expected)
584611

585612
def test_broken_condcoms(self):
586613
# these condcoms are missing the '--' after '<!' and before the '>'
614+
# and they are considered bogus comments according to
615+
# "8.2.4.42. Markup declaration open state"
587616
html = ('<![if !(IE)]>broken condcom<![endif]>'
588617
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
589618
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
590619
'<![if !ie 6]><b>foo</b><![endif]>'
591620
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
592-
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
593-
# and "8.2.4.45 Markup declaration open state", comment tokens should
594-
# be emitted instead of 'unknown decl', but calling unknown_decl
595-
# provides more flexibility.
596-
# See also Lib/_markupbase.py:parse_declaration
597621
expected = [
598-
('unknown decl', 'if !(IE)'),
622+
('comment', '[if !(IE)]'),
599623
('data', 'broken condcom'),
600-
('unknown decl', 'endif'),
601-
('unknown decl', 'if ! IE'),
624+
('comment', '[endif]'),
625+
('comment', '[if ! IE]'),
602626
('startendtag', 'link', [('href', 'favicon.tiff')]),
603-
('unknown decl', 'endif'),
604-
('unknown decl', 'if !IE 6'),
627+
('comment', '[endif]'),
628+
('comment', '[if !IE 6]'),
605629
('startendtag', 'img', [('src', 'firefox.png')]),
606-
('unknown decl', 'endif'),
607-
('unknown decl', 'if !ie 6'),
630+
('comment', '[endif]'),
631+
('comment', '[if !ie 6]'),
608632
('starttag', 'b', []),
609633
('data', 'foo'),
610634
('endtag', 'b'),
611-
('unknown decl', 'endif'),
612-
('unknown decl', 'if (!IE)|(lt IE 9)'),
635+
('comment', '[endif]'),
636+
('comment', '[if (!IE)|(lt IE 9)]'),
613637
('startendtag', 'img', [('src', 'mammoth.bmp')]),
614-
('unknown decl', 'endif')
638+
('comment', '[endif]')
639+
]
640+
self._run_check(html, expected)
641+
642+
def test_cdata_declarations(self):
643+
# More tests should be added. See also "8.2.4.42. Markup
644+
# declaration open state", "8.2.4.69. CDATA section state",
645+
# and issue 32876
646+
html = ('<![CDATA[just some plain text]]>')
647+
expected = [('unknown decl', 'CDATA[just some plain text')]
648+
self._run_check(html, expected)
649+
650+
def test_cdata_declarations_multiline(self):
651+
html = ('<code><![CDATA['
652+
' if (a < b && a > b) {'
653+
' printf("[<marquee>How?</marquee>]");'
654+
' }'
655+
']]></code>')
656+
expected = [
657+
('starttag', 'code', []),
658+
('unknown decl',
659+
'CDATA[ if (a < b && a > b) { '
660+
'printf("[<marquee>How?</marquee>]"); }'),
661+
('endtag', 'code')
615662
]
616663
self._run_check(html, expected)
617664

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fix handling of invalid markup declarations in
2+
:class:`html.parser.HTMLParser`.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy