diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 5d03c98df5cdd0..75bf8adae6d70a 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -146,6 +146,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._support_cdata = True self._escapable = True super().reset() @@ -183,6 +184,19 @@ def clear_cdata_mode(self): self.cdata_elem = None self._escapable = True + def _set_support_cdata(self, flag=True): + """Enable or disable support of the CDATA sections. + If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>". + If disabled, "<![CDATA[" starts a bogus comment which ends with ">". + + This method is not called by default. Its purpose is to be called + in custom handle_starttag() and handle_endtag() methods, with a + value that depends on the adjusted current node. + See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + for details. + """ + self._support_cdata = flag + # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. 
@@ -258,7 +272,10 @@ def goahead(self, end): break self.handle_comment(rawdata[i+4:j]) elif startswith("', i+9) + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 + else: + return self.parse_bogus_comment(i) elif rawdata[i:i+9].lower() == ' gtpos = rawdata.find('>', i+9) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 380bbe40177ec5..fff41dab321acd 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -10,10 +10,13 @@ class EventCollector(html.parser.HTMLParser): - def __init__(self, *args, **kw): + def __init__(self, *args, autocdata=False, **kw): + self.autocdata = autocdata self.events = [] self.append = self.events.append html.parser.HTMLParser.__init__(self, *args, **kw) + if autocdata: + self._set_support_cdata(False) def get_events(self): # Normalize the list of events so that buffer artefacts don't @@ -34,12 +37,16 @@ def get_events(self): def handle_starttag(self, tag, attrs): self.append(("starttag", tag, attrs)) + if self.autocdata and tag == 'svg': + self._set_support_cdata(True) def handle_startendtag(self, tag, attrs): self.append(("startendtag", tag, attrs)) def handle_endtag(self, tag): self.append(("endtag", tag)) + if self.autocdata and tag == 'svg': + self._set_support_cdata(False) # all other markup @@ -767,10 +774,6 @@ def test_eof_in_declarations(self): ('' '' @@ -845,28 +860,53 @@ def test_broken_condcoms(self): ] self._run_check(html, expected) - def test_cdata_declarations(self): - # More tests should be added. See also "8.2.4.42. Markup - # declaration open state", "8.2.4.69. 
CDATA section state", - # and issue 32876 - html = ('') - expected = [('unknown decl', 'CDATA[just some plain text')] + @support.subTests('content', [ + 'just some plain text', + '', + '¬-an-entity-ref;', + "", + '', + '[[I have many brackets]]', + 'I have a > in the middle', + 'I have a ]] in the middle', + '] ]>', + ']] >', + ('\n' + ' if (a < b && a > b) {\n' + ' printf("[How?]");\n' + ' }\n'), + ]) + def test_cdata_section_content(self, content): + # See "13.2.5.42 Markup declaration open state", + # "13.2.5.69 CDATA section state", and issue bpo-32876. + html = f'{content}' + expected = [ + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[' + content), + ('endtag', 'text'), + ('endtag', 'svg'), + ] self._run_check(html, expected) + self._run_check(html, expected, collector=EventCollector(autocdata=True)) - def test_cdata_declarations_multiline(self): - html = (' b) {' - ' printf("[How?]");' - ' }' - ']]>') + def test_cdata_section(self): + # See "13.2.5.42 Markup declaration open state". + html = ('bar]]>' + 'foo<br>bar' + 'bar]]>') expected = [ - ('starttag', 'code', []), - ('unknown decl', - 'CDATA[ if (a < b && a > b) { ' - 'printf("[How?]"); }'), - ('endtag', 'code') + ('comment', '[CDATA[foo'), + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[foo
bar'), + ('endtag', 'text'), + ('endtag', 'svg'), + ('comment', '[CDATA[foo'), ] - self._run_check(html, expected) + self._run_check(html, expected, collector=EventCollector(autocdata=True)) def test_convert_charrefs_dropped_text(self): # #23144: make sure that all the events are triggered when diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst new file mode 100644 index 00000000000000..64bb32704ca51f --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -0,0 +1,5 @@ +Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to +the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section. +Add private method ``_set_support_cdata()`` which can be used to specify +how to parse ``<![CDATA[`` -- as a CDATA section in foreign content +(SVG or MathML) or as a bogus comment in the HTML namespace. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy