Skip to content

Commit f66c75f

Browse files
miss-islington, timonviola, serhiy-storchaka, ambv
authored
[3.12] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-136986)
(cherry picked from commit 4d02f31)

Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
1 parent ad695f5 commit f66c75f

File tree

3 files changed

+113
-5
lines changed

3 files changed

+113
-5
lines changed

Lib/html/parser.py

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
110110
"""
111111

112112
CDATA_CONTENT_ELEMENTS = ("script", "style")
113+
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
113114

114115
def __init__(self, *, convert_charrefs=True):
115116
"""Initialize and reset this instance.
@@ -127,6 +128,7 @@ def reset(self):
127128
self.lasttag = '???'
128129
self.interesting = interesting_normal
129130
self.cdata_elem = None
131+
self._escapable = True
130132
super().reset()
131133

132134
def feed(self, data):
@@ -148,14 +150,20 @@ def get_starttag_text(self):
148150
"""Return full source of start tag: '<...>'."""
149151
return self.__starttag_text
150152

151-
def set_cdata_mode(self, elem):
153+
def set_cdata_mode(self, elem, *, escapable=False):
152154
self.cdata_elem = elem.lower()
153-
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
154-
re.IGNORECASE|re.ASCII)
155+
self._escapable = escapable
156+
if escapable and not self.convert_charrefs:
157+
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
158+
re.IGNORECASE|re.ASCII)
159+
else:
160+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
161+
re.IGNORECASE|re.ASCII)
155162

156163
def clear_cdata_mode(self):
157164
self.interesting = interesting_normal
158165
self.cdata_elem = None
166+
self._escapable = True
159167

160168
# Internal -- handle data as far as reasonable. May leave state
161169
# and data to be processed by a subsequent call. If 'end' is
@@ -188,7 +196,7 @@ def goahead(self, end):
188196
break
189197
j = n
190198
if i < j:
191-
if self.convert_charrefs and not self.cdata_elem:
199+
if self.convert_charrefs and self._escapable:
192200
self.handle_data(unescape(rawdata[i:j]))
193201
else:
194202
self.handle_data(rawdata[i:j])
@@ -290,7 +298,7 @@ def goahead(self, end):
290298
assert 0, "interesting.search() lied"
291299
# end while
292300
if end and i < n:
293-
if self.convert_charrefs and not self.cdata_elem:
301+
if self.convert_charrefs and self._escapable:
294302
self.handle_data(unescape(rawdata[i:n]))
295303
else:
296304
self.handle_data(rawdata[i:n])
@@ -402,6 +410,8 @@ def parse_starttag(self, i):
402410
self.handle_starttag(tag, attrs)
403411
if tag in self.CDATA_CONTENT_ELEMENTS:
404412
self.set_cdata_mode(tag)
413+
elif tag in self.RCDATA_CONTENT_ELEMENTS:
414+
self.set_cdata_mode(tag, escapable=True)
405415
return endpos
406416

407417
# Internal -- check to see if we have a complete starttag; return end

Lib/test/test_htmlparser.py

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -317,6 +317,49 @@ def test_style_content(self, content):
317317
("data", content),
318318
("endtag", "style")])
319319

320+
@support.subTests('content', [
321+
'<!-- not a comment -->',
322+
"<not a='start tag'>",
323+
'<![CDATA[not a cdata]]>',
324+
'<!not a bogus comment>',
325+
'</not a bogus comment>',
326+
'\u2603',
327+
'< /title>',
328+
'</ title>',
329+
'</titled>',
330+
'</title\v>',
331+
'</title\xa0>',
332+
'</tıtle>',
333+
])
334+
def test_title_content(self, content):
335+
source = f"<title>{content}</title>"
336+
self._run_check(source, [
337+
("starttag", "title", []),
338+
("data", content),
339+
("endtag", "title"),
340+
])
341+
342+
@support.subTests('content', [
343+
'<!-- not a comment -->',
344+
"<not a='start tag'>",
345+
'<![CDATA[not a cdata]]>',
346+
'<!not a bogus comment>',
347+
'</not a bogus comment>',
348+
'\u2603',
349+
'< /textarea>',
350+
'</ textarea>',
351+
'</textareable>',
352+
'</textarea\v>',
353+
'</textarea\xa0>',
354+
])
355+
def test_textarea_content(self, content):
356+
source = f"<textarea>{content}</textarea>"
357+
self._run_check(source, [
358+
("starttag", "textarea", []),
359+
("data", content),
360+
("endtag", "textarea"),
361+
])
362+
320363
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
321364
'script/', 'script foo=bar', 'script foo=">"'])
322365
def test_script_closing_tag(self, endtag):
@@ -346,6 +389,38 @@ def test_style_closing_tag(self, endtag):
346389
("endtag", "style")],
347390
collector=EventCollectorNoNormalize(convert_charrefs=False))
348391

392+
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
393+
'title/', 'title foo=bar', 'title foo=">"'])
394+
def test_title_closing_tag(self, endtag):
395+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
396+
s = f'<TitLe>{content}</{endtag}>'
397+
self._run_check(s, [("starttag", "title", []),
398+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
399+
("endtag", "title")],
400+
collector=EventCollectorNoNormalize(convert_charrefs=True))
401+
self._run_check(s, [("starttag", "title", []),
402+
('data', '<!-- not a comment --><i>Egg '),
403+
('entityref', 'amp'),
404+
('data', ' Spam</i>'),
405+
("endtag", "title")],
406+
collector=EventCollectorNoNormalize(convert_charrefs=False))
407+
408+
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
409+
'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
410+
def test_textarea_closing_tag(self, endtag):
411+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
412+
s = f'<TexTarEa>{content}</{endtag}>'
413+
self._run_check(s, [("starttag", "textarea", []),
414+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
415+
("endtag", "textarea")],
416+
collector=EventCollectorNoNormalize(convert_charrefs=True))
417+
self._run_check(s, [("starttag", "textarea", []),
418+
('data', '<!-- not a comment --><i>Egg '),
419+
('entityref', 'amp'),
420+
('data', ' Spam</i>'),
421+
("endtag", "textarea")],
422+
collector=EventCollectorNoNormalize(convert_charrefs=False))
423+
349424
@support.subTests('tail,end', [
350425
('', False),
351426
('<', False),
@@ -363,6 +438,27 @@ def test_eof_in_script(self, tail, end):
363438
("data", content if end else content + tail)],
364439
collector=EventCollectorNoNormalize(convert_charrefs=False))
365440

441+
@support.subTests('tail,end', [
442+
('', False),
443+
('<', False),
444+
('</', False),
445+
('</t', False),
446+
('</title', False),
447+
('</title ', True),
448+
('</title foo=bar', True),
449+
('</title foo=">', True),
450+
])
451+
def test_eof_in_title(self, tail, end):
452+
s = f'<TitLe>Egg &amp; Spam{tail}'
453+
self._run_check(s, [("starttag", "title", []),
454+
("data", "Egg & Spam" + ('' if end else tail))],
455+
collector=EventCollectorNoNormalize(convert_charrefs=True))
456+
self._run_check(s, [("starttag", "title", []),
457+
('data', 'Egg '),
458+
('entityref', 'amp'),
459+
('data', ' Spam' + ('' if end else tail))],
460+
collector=EventCollectorNoNormalize(convert_charrefs=False))
461+
366462
def test_comments(self):
367463
html = ("<!-- I'm a valid comment -->"
368464
'<!--me too!-->'
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fix support of escapable raw text mode (elements "textarea" and "title")
2+
in :class:`html.parser.HTMLParser`.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy