Skip to content

Commit c3c0f07

Browse files
miss-islington, timonviola, serhiy-storchaka, and ambv
authored and committed
[3.12] pythongh-118350: Fix support of elements "textarea" and "title" in HTMLParser (pythonGH-135310) (pythonGH-136986)
(cherry picked from commit 4d02f31)
Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
Signed-off-by: Michał Górny <mgorny@gentoo.org>
1 parent 59ff151 commit c3c0f07

File tree

2 files changed

+112
-5
lines changed

2 files changed

+112
-5
lines changed

Lib/html/parser.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
110110
"""
111111

112112
CDATA_CONTENT_ELEMENTS = ("script", "style")
113+
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
113114

114115
def __init__(self, *, convert_charrefs=True):
115116
"""Initialize and reset this instance.
@@ -126,7 +127,9 @@ def reset(self):
126127
self.lasttag = '???'
127128
self.interesting = interesting_normal
128129
self.cdata_elem = None
130+
self._escapable = True
129131
_markupbase.ParserBase.reset(self)
132+
super().reset()
130133

131134
def feed(self, data):
132135
r"""Feed data to the parser.
@@ -147,14 +150,20 @@ def get_starttag_text(self):
147150
"""Return full source of start tag: '<...>'."""
148151
return self.__starttag_text
149152

150-
def set_cdata_mode(self, elem):
153+
def set_cdata_mode(self, elem, *, escapable=False):
151154
self.cdata_elem = elem.lower()
152-
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
153-
re.IGNORECASE|re.ASCII)
155+
self._escapable = escapable
156+
if escapable and not self.convert_charrefs:
157+
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
158+
re.IGNORECASE|re.ASCII)
159+
else:
160+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
161+
re.IGNORECASE|re.ASCII)
154162

155163
def clear_cdata_mode(self):
156164
self.interesting = interesting_normal
157165
self.cdata_elem = None
166+
self._escapable = True
158167

159168
# Internal -- handle data as far as reasonable. May leave state
160169
# and data to be processed by a subsequent call. If 'end' is
@@ -187,7 +196,7 @@ def goahead(self, end):
187196
break
188197
j = n
189198
if i < j:
190-
if self.convert_charrefs and not self.cdata_elem:
199+
if self.convert_charrefs and self._escapable:
191200
self.handle_data(unescape(rawdata[i:j]))
192201
else:
193202
self.handle_data(rawdata[i:j])
@@ -289,7 +298,7 @@ def goahead(self, end):
289298
assert 0, "interesting.search() lied"
290299
# end while
291300
if end and i < n:
292-
if self.convert_charrefs and not self.cdata_elem:
301+
if self.convert_charrefs and self._escapable:
293302
self.handle_data(unescape(rawdata[i:n]))
294303
else:
295304
self.handle_data(rawdata[i:n])
@@ -408,6 +417,8 @@ def parse_starttag(self, i):
408417
self.handle_starttag(tag, attrs)
409418
if tag in self.CDATA_CONTENT_ELEMENTS:
410419
self.set_cdata_mode(tag)
420+
elif tag in self.RCDATA_CONTENT_ELEMENTS:
421+
self.set_cdata_mode(tag, escapable=True)
411422
return endpos
412423

413424
# Internal -- check to see if we have a complete starttag; return end

Lib/test/test_htmlparser.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,49 @@ def test_style_content(self, content):
316316
("data", content),
317317
("endtag", "style")])
318318

319+
@support.subTests('content', [
320+
'<!-- not a comment -->',
321+
"<not a='start tag'>",
322+
'<![CDATA[not a cdata]]>',
323+
'<!not a bogus comment>',
324+
'</not a bogus comment>',
325+
'\u2603',
326+
'< /title>',
327+
'</ title>',
328+
'</titled>',
329+
'</title\v>',
330+
'</title\xa0>',
331+
'</tıtle>',
332+
])
333+
def test_title_content(self, content):
334+
source = f"<title>{content}</title>"
335+
self._run_check(source, [
336+
("starttag", "title", []),
337+
("data", content),
338+
("endtag", "title"),
339+
])
340+
341+
@support.subTests('content', [
342+
'<!-- not a comment -->',
343+
"<not a='start tag'>",
344+
'<![CDATA[not a cdata]]>',
345+
'<!not a bogus comment>',
346+
'</not a bogus comment>',
347+
'\u2603',
348+
'< /textarea>',
349+
'</ textarea>',
350+
'</textareable>',
351+
'</textarea\v>',
352+
'</textarea\xa0>',
353+
])
354+
def test_textarea_content(self, content):
355+
source = f"<textarea>{content}</textarea>"
356+
self._run_check(source, [
357+
("starttag", "textarea", []),
358+
("data", content),
359+
("endtag", "textarea"),
360+
])
361+
319362
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
320363
'script/', 'script foo=bar', 'script foo=">"'])
321364
def test_script_closing_tag(self, endtag):
@@ -345,6 +388,38 @@ def test_style_closing_tag(self, endtag):
345388
("endtag", "style")],
346389
collector=EventCollectorNoNormalize(convert_charrefs=False))
347390

391+
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
392+
'title/', 'title foo=bar', 'title foo=">"'])
393+
def test_title_closing_tag(self, endtag):
394+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
395+
s = f'<TitLe>{content}</{endtag}>'
396+
self._run_check(s, [("starttag", "title", []),
397+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
398+
("endtag", "title")],
399+
collector=EventCollectorNoNormalize(convert_charrefs=True))
400+
self._run_check(s, [("starttag", "title", []),
401+
('data', '<!-- not a comment --><i>Egg '),
402+
('entityref', 'amp'),
403+
('data', ' Spam</i>'),
404+
("endtag", "title")],
405+
collector=EventCollectorNoNormalize(convert_charrefs=False))
406+
407+
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
408+
'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
409+
def test_textarea_closing_tag(self, endtag):
410+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
411+
s = f'<TexTarEa>{content}</{endtag}>'
412+
self._run_check(s, [("starttag", "textarea", []),
413+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
414+
("endtag", "textarea")],
415+
collector=EventCollectorNoNormalize(convert_charrefs=True))
416+
self._run_check(s, [("starttag", "textarea", []),
417+
('data', '<!-- not a comment --><i>Egg '),
418+
('entityref', 'amp'),
419+
('data', ' Spam</i>'),
420+
("endtag", "textarea")],
421+
collector=EventCollectorNoNormalize(convert_charrefs=False))
422+
348423
@support.subTests('tail,end', [
349424
('', False),
350425
('<', False),
@@ -362,6 +437,27 @@ def test_eof_in_script(self, tail, end):
362437
("data", content if end else content + tail)],
363438
collector=EventCollectorNoNormalize(convert_charrefs=False))
364439

440+
@support.subTests('tail,end', [
441+
('', False),
442+
('<', False),
443+
('</', False),
444+
('</t', False),
445+
('</title', False),
446+
('</title ', True),
447+
('</title foo=bar', True),
448+
('</title foo=">', True),
449+
])
450+
def test_eof_in_title(self, tail, end):
451+
s = f'<TitLe>Egg &amp; Spam{tail}'
452+
self._run_check(s, [("starttag", "title", []),
453+
("data", "Egg & Spam" + ('' if end else tail))],
454+
collector=EventCollectorNoNormalize(convert_charrefs=True))
455+
self._run_check(s, [("starttag", "title", []),
456+
('data', 'Egg '),
457+
('entityref', 'amp'),
458+
('data', ' Spam' + ('' if end else tail))],
459+
collector=EventCollectorNoNormalize(convert_charrefs=False))
460+
365461
def test_comments(self):
366462
html = ("<!-- I'm a valid comment -->"
367463
'<!--me too!-->'

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy