diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 7eea885cfe63c5..5d03c98df5cdd0 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -128,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
"""
CDATA_CONTENT_ELEMENTS = ("script", "style")
+ RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
@@ -145,6 +146,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
+ self._escapable = True
super().reset()
def feed(self, data):
@@ -166,14 +168,20 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
-    def set_cdata_mode(self, elem):
+    def set_cdata_mode(self, elem, *, escapable=False):
+        """Switch the scanner to raw text mode for the element *elem*.
+
+        Until clear_cdata_mode() is called, self.interesting only matches
+        the end tag of *elem* (case-insensitively), so everything else is
+        reported as data.
+
+        If *escapable* is true, the content is text in which character
+        references are still meaningful: goahead() unescapes the data
+        when convert_charrefs is set, and otherwise the scanner also
+        stops at '&'.  If it is false, the data is passed through
+        verbatim.
+        """
         self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
-                                      re.IGNORECASE|re.ASCII)
+        self._escapable = escapable
+        # Only a real end tag -- "</name" followed by whitespace, "/" or
+        # ">" -- may terminate the raw text.  Matching the bare element
+        # name would stop the scan in the middle of ordinary text.
+        end_tag = r'</%s(?=[\t\n\r\f />])' % self.cdata_elem
+        if escapable and not self.convert_charrefs:
+            # Character references are not unescaped in bulk in this mode,
+            # so the scanner has to stop at each '&' as well.
+            end_tag = '&|' + end_tag
+        self.interesting = re.compile(end_tag, re.IGNORECASE|re.ASCII)
def clear_cdata_mode(self):
+ # Leave raw text mode: install the same "interesting" pattern that
+ # reset() uses and forget the current raw text element.
self.interesting = interesting_normal
self.cdata_elem = None
+ # Text outside raw text elements is escapable, so goahead() may
+ # unescape it again when convert_charrefs is set.
+ self._escapable = True
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
@@ -206,7 +214,7 @@ def goahead(self, end):
break
j = n
if i < j:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
@@ -308,7 +316,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
@@ -420,6 +428,8 @@ def parse_starttag(self, i):
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
+ elif tag in self.RCDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag, escapable=True)
return endpos
# Internal -- check to see if we have a complete starttag; return end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 47c0752fb517b9..380bbe40177ec5 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -317,6 +317,49 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])
+ @support.subTests('content', [
+ '',
+ "