diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 13c95c34e505c8..315bfd5d249b68 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -260,7 +260,7 @@ def parse_html_declaration(self, i): if rawdata[i:i+4] == '' '' '' - '') + '' + # see #32876 + '' + '' + '' + '' + '' + '' # required '[' after CDATA + ) expected = [ ('comment', ' not really a comment '), ('comment', ' not a comment either --'), @@ -552,39 +573,65 @@ def test_broken_comments(self): ('comment', ''), ('comment', '<-- this was an empty comment'), ('comment', '!! another bogus comment !!!'), + ('comment', '[with square brackets]!'), + ('comment', '[\nmultiline\nbogusness\n]!'), + ('comment', '[more brackets]-[and a hyphen]!'), + ('comment', '[cdata[should be uppercase]]'), + ('comment', '[CDATA [whitespaces are not ignored]]'), + ('comment', '[CDATA]]'), ] self._run_check(html, expected) def test_broken_condcoms(self): # these condcoms are missing the '--' after '' + # and they are considered bogus comments according to + # "8.2.4.42. Markup declaration open state" html = ('broken condcom' '' '' 'foo' '') - # According to the HTML5 specs sections "8.2.4.44 Bogus comment state" - # and "8.2.4.45 Markup declaration open state", comment tokens should - # be emitted instead of 'unknown decl', but calling unknown_decl - # provides more flexibility. - # See also Lib/_markupbase.py:parse_declaration expected = [ - ('unknown decl', 'if !(IE)'), + ('comment', '[if !(IE)]'), ('data', 'broken condcom'), - ('unknown decl', 'endif'), - ('unknown decl', 'if ! IE'), + ('comment', '[endif]'), + ('comment', '[if ! IE]'), ('startendtag', 'link', [('href', 'favicon.tiff')]), - ('unknown decl', 'endif'), - ('unknown decl', 'if !IE 6'), + ('comment', '[endif]'), + ('comment', '[if !IE 6]'), ('startendtag', 'img', [('src', 'firefox.png')]), - ('unknown decl', 'endif'), - ('unknown decl', 'if !ie 6'), + ('comment', '[endif]'), + ('comment', '[if !ie 6]'), ('starttag', 'b', []), ('data', 'foo'), ('endtag', 'b'), - ('unknown decl', 'endif'), - ('unknown decl', 'if (!IE)|(lt IE 9)'), + ('comment', '[endif]'), + ('comment', '[if (!IE)|(lt IE 9)]'), ('startendtag', 'img', [('src', 'mammoth.bmp')]), - ('unknown decl', 'endif') + ('comment', '[endif]') + ] + self._run_check(html, expected) + + def test_cdata_declarations(self): + # More tests should be added. See also "8.2.4.42. Markup + # declaration open state", "8.2.4.69. CDATA section state", + # and issue 32876 + html = ('') + expected = [('unknown decl', 'CDATA[just some plain text')] + self._run_check(html, expected) + + def test_cdata_declarations_multiline(self): + html = (' b) {' + ' printf("[How?]");' + ' }' + ']]>') + expected = [ + ('starttag', 'code', []), + ('unknown decl', + 'CDATA[ if (a < b && a > b) { ' + 'printf("[How?]"); }'), + ('endtag', 'code') ] self._run_check(html, expected) diff --git a/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst new file mode 100644 index 00000000000000..42107de75c7d29 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst @@ -0,0 +1,2 @@ +Fix handling of invalid markup declarations in +:class:`html.parser.HTMLParser`. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy