From 5c0a5f37237a641ef0327de4f51f9e26fdd967f0 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Fri, 14 Sep 2018 00:13:22 -0700 Subject: [PATCH 1/5] bpo-32876: fix handling of invalid markup declarations. --- Lib/html/parser.py | 4 ++-- Lib/test/test_htmlparser.py | 41 ++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index de81879a631ac7..26c6ebb3b906de 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -260,7 +260,7 @@ def parse_html_declaration(self, i): if rawdata[i:i+4] == '' '' '' - '') + '' + # see #32876 + '' + '' + '' + '') expected = [ ('comment', ' not really a comment '), ('comment', ' not a comment either --'), @@ -593,39 +598,40 @@ def test_broken_comments(self): ('comment', ''), ('comment', '<-- this was an empty comment'), ('comment', '!! another bogus comment !!!'), + ('comment', '[with square brackets]!'), + ('comment', '[\nmultiline\nbogusness\n]!'), + ('comment', '[more brackets]-[and a hyphen]!'), + ('comment', '[cdata[should be uppercase]]'), ] self._run_check(html, expected) def test_broken_condcoms(self): # these condcoms are missing the '--' after '' + # and they are considered bogus comments according to + # "8.2.4.42. Markup declaration open state" html = ('broken condcom' '' '' 'foo' '') - # According to the HTML5 specs sections "8.2.4.44 Bogus comment state" - # and "8.2.4.45 Markup declaration open state", comment tokens should - # be emitted instead of 'unknown decl', but calling unknown_decl - # provides more flexibility. - # See also Lib/_markupbase.py:parse_declaration expected = [ - ('unknown decl', 'if !(IE)'), + ('comment', '[if !(IE)]'), ('data', 'broken condcom'), - ('unknown decl', 'endif'), - ('unknown decl', 'if ! IE'), + ('comment', '[endif]'), + ('comment', '[if ! IE]'), ('startendtag', 'link', [('href', 'favicon.tiff')]), - ('unknown decl', 'endif'), - ('unknown decl', 'if !IE 6'), + ('comment', '[endif]'), + ('comment', '[if !IE 6]'), ('startendtag', 'img', [('src', 'firefox.png')]), - ('unknown decl', 'endif'), - ('unknown decl', 'if !ie 6'), + ('comment', '[endif]'), + ('comment', '[if !ie 6]'), ('starttag', 'b', []), ('data', 'foo'), ('endtag', 'b'), - ('unknown decl', 'endif'), - ('unknown decl', 'if (!IE)|(lt IE 9)'), + ('comment', '[endif]'), + ('comment', '[if (!IE)|(lt IE 9)]'), ('startendtag', 'img', [('src', 'mammoth.bmp')]), - ('unknown decl', 'endif') + ('comment', '[endif]') ] self._run_check(html, expected) @@ -642,6 +648,7 @@ def test_convert_charrefs_dropped_text(self): ) + class AttributesTestCase(TestCaseBase): def test_attr_syntax(self): From 74cf400cabebd831e2e2ff16e1a47abbe4b553ce Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Fri, 14 Sep 2018 13:41:22 -0700 Subject: [PATCH 2/5] bpo-32876: fix CDATA handling and add a couple of tests. --- Lib/html/parser.py | 2 +- Lib/test/test_htmlparser.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 26c6ebb3b906de..eacbb09aa2a53e 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -260,7 +260,7 @@ def parse_html_declaration(self, i): if rawdata[i:i+4] == '' @@ -556,7 +562,10 @@ def test_bogus_comments(self): '' '' '' - '') + '' + '' + '' # required '[' after CDATA + ) expected = [ ('comment', ' not really a comment '), ('comment', ' not a comment either --'), @@ -568,6 +577,8 @@ def test_bogus_comments(self): ('comment', '[\nmultiline\nbogusness\n]!'), ('comment', '[more brackets]-[and a hyphen]!'), ('comment', '[cdata[should be uppercase]]'), + ('comment', '[CDATA [whitespaces are not ignored]]'), + ('comment', '[CDATA]]'), ] self._run_check(html, expected) From 9c5e14a35dfa725b240d458b9eaac04045e93c70 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 9 May 2025 15:50:46 +0300 Subject: [PATCH 4/5] Add a NEWS entry. --- .../next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst diff --git a/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst new file mode 100644 index 00000000000000..42107de75c7d29 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst @@ -0,0 +1,2 @@ +Fix handling of invalid markup declarations in +:class:`html.parser.HTMLParser`. From 57cdab23a72023567a88ca84f77e52ddcbd939ce Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 9 May 2025 15:51:25 +0300 Subject: [PATCH 5/5] Remove redundant empty lines. --- Lib/test/test_htmlparser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index fee67f79cf28e4..b1aa8dd3a0636c 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -635,7 +635,6 @@ def test_cdata_declarations_multiline(self): ] self._run_check(html, expected) - def test_convert_charrefs_dropped_text(self): # #23144: make sure that all the events are triggered when # convert_charrefs is True, even if we don't call .close() @@ -649,7 +648,6 @@ def test_convert_charrefs_dropped_text(self): ) - class AttributesTestCase(TestCaseBase): def test_attr_syntax(self): pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy