Skip to content

gh-134873: fix various quadratic worst-time complexities in _header_value_parser.py [WIP] #134947

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 59 additions & 51 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,14 @@

# Whitespace characters: space and horizontal tab.
WSP = {' ', '\t'}
# Whitespace plus '('; the WITH_DOT variant additionally contains '.'.
CFWS_LEADER = WSP.union('(')
CFWS_LEADER_WITH_DOT = CFWS_LEADER.union('.')
SPECIALS = set('()<>@,:;.\\"[]')
ATOM_ENDS = SPECIALS.union(WSP)
DOT_ATOM_ENDS = ATOM_ENDS.difference('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS.difference('."(')
# String forms of PHRASE_ENDS, suitable as the character argument of
# str.lstrip(); the character order within the string is irrelevant there.
PHRASE_ENDS_CHARS = ''.join(PHRASE_ENDS)
PHRASE_ENDS_CHARS_NO_SEMICOLON = PHRASE_ENDS_CHARS.replace(';', '')
TSPECIALS = SPECIALS.union('/?=').difference('.')
TOKEN_ENDS = TSPECIALS.union(WSP)
ASPECIALS = TSPECIALS.union("*'%")
Expand Down Expand Up @@ -1300,6 +1303,12 @@ def get_cfws(value):
cfws.append(token)
return cfws, value

def get_cfws_digits(value, leader_set):
    """Split *value* at the first character that belongs to *leader_set*.

    Return a ``(head, rest)`` pair where *head* is the longest prefix of
    *value* containing no character from *leader_set* and *rest* is the
    remainder (empty when no such character occurs).  The prefix is not
    checked for being numeric here; that is left to the caller.
    """
    for split_at, char in enumerate(value):
        if char in leader_set:
            break
    else:
        # No delimiter found (or empty input): everything is the head.
        split_at = len(value)
    return value[:split_at], value[split_at:]

def get_quoted_string(value):
"""quoted-string = [CFWS] <bare-quoted-string> [CFWS]

Expand Down Expand Up @@ -1443,11 +1452,14 @@ def get_phrase(value):
phrase.defects.append(errors.InvalidHeaderDefect(
"phrase does not start with word"))
while value and value[0] not in PHRASE_ENDS:
if value[0]=='.':
phrase.append(DOT)
phrase.defects.append(errors.ObsoleteHeaderDefect(
"period in 'phrase'"))
value = value[1:]
if value[0] == '.':
tmpvalue = value.lstrip('.')
n = len(value) - len(tmpvalue)
phrase.extend(DOT for _ in range(n))
phrase.defects.extend(
errors.ObsoleteHeaderDefect("period in 'phrase'")
for _ in range(n))
value = tmpvalue
else:
try:
token, value = get_word(value)
Expand All @@ -1461,6 +1473,19 @@ def get_phrase(value):
phrase.append(token)
return phrase, value

def _find_phrase(reslist, value, phrase_ends, phrase_end_chars, endchars):
while value and value[0] not in endchars:
if value[0] in phrase_ends:
tmpvalue = value.lstrip(phrase_end_chars)
reslist.extend(
ValueTerminal(value[i], 'misplaced-special')
for i in range(len(value) - len(tmpvalue)))
value = tmpvalue
else:
token, value = get_phrase(value)
reslist.append(token)
return value

def get_local_part(value):
""" local-part = dot-atom / quoted-string / obs-local-part

Expand Down Expand Up @@ -1842,14 +1867,10 @@ def get_invalid_mailbox(value, endchars):

"""
invalid_mailbox = InvalidMailbox()
while value and value[0] not in endchars:
if value[0] in PHRASE_ENDS:
invalid_mailbox.append(ValueTerminal(value[0],
'misplaced-special'))
value = value[1:]
else:
token, value = get_phrase(value)
invalid_mailbox.append(token)
# lstrip() should not strip stuff in 'endchars'
phrase_end_chars = ''.join(PHRASE_ENDS - set(endchars))
value = _find_phrase(invalid_mailbox, value,
PHRASE_ENDS, phrase_end_chars, endchars)
return invalid_mailbox, value

def get_mailbox_list(value):
Expand Down Expand Up @@ -2196,10 +2217,7 @@ def parse_mime_version(value):
if not value:
mime_version.defects.append(errors.HeaderMissingRequiredValue(
"Expected MIME version number but found only CFWS"))
digits = ''
while value and value[0] != '.' and value[0] not in CFWS_LEADER:
digits += value[0]
value = value[1:]
digits, value = get_cfws_digits(value, CFWS_LEADER_WITH_DOT)
if not digits.isdigit():
mime_version.defects.append(errors.InvalidHeaderDefect(
"Expected MIME major version number but found {!r}".format(digits)))
Expand Down Expand Up @@ -2227,10 +2245,7 @@ def parse_mime_version(value):
mime_version.defects.append(errors.InvalidHeaderDefect(
"Incomplete MIME version; found only major number"))
return mime_version
digits = ''
while value and value[0] not in CFWS_LEADER:
digits += value[0]
value = value[1:]
digits, value = get_cfws_digits(value, CFWS_LEADER)
if not digits.isdigit():
mime_version.defects.append(errors.InvalidHeaderDefect(
"Expected MIME minor version number but found {!r}".format(digits)))
Expand All @@ -2255,14 +2270,8 @@ def get_invalid_parameter(value):

"""
invalid_parameter = InvalidParameter()
while value and value[0] != ';':
if value[0] in PHRASE_ENDS:
invalid_parameter.append(ValueTerminal(value[0],
'misplaced-special'))
value = value[1:]
else:
token, value = get_phrase(value)
invalid_parameter.append(token)
value = _find_phrase(invalid_parameter, value,
PHRASE_ENDS, PHRASE_ENDS_CHARS_NO_SEMICOLON, ';')
return invalid_parameter, value

def get_ttext(value):
Expand Down Expand Up @@ -2407,10 +2416,8 @@ def get_section(value):
if not value or not value[0].isdigit():
raise errors.HeaderParseError("Expected section number but "
"found {}".format(value))
digits = ''
while value and value[0].isdigit():
digits += value[0]
value = value[1:]
# Index of the first non-digit character.  The default must be len(value):
# when every character is a digit the generator is exhausted, and a default
# of 0 would leave 'digits' empty so that the digits[0] check below raises
# IndexError instead of consuming the whole number.
ind = next((i for i, ch in enumerate(value) if not ch.isdigit()),
           len(value))
digits, value = value[:ind], value[ind:]
if digits[0] == '0' and digits != '0':
section.defects.append(errors.InvalidHeaderDefect(
"section number has an invalid leading 0"))
Expand Down Expand Up @@ -2567,12 +2574,15 @@ def get_parameter(value):
while value:
if value[0] in WSP:
token, value = get_fws(value)
v.append(token)
elif value[0] == '"':
token = ValueTerminal('"', 'DQUOTE')
value = value[1:]
tmpvalue = value.lstrip('"')
n = len(value) - len(tmpvalue)
v.extend((ValueTerminal('"', 'DQUOTE') for _ in range(n)))
value = tmpvalue
else:
token, value = get_qcontent(value)
v.append(token)
v.append(token)
token = v
else:
token, value = get_value(value)
Expand Down Expand Up @@ -2638,17 +2648,11 @@ def _find_mime_parameters(tokenlist, value):
"""Do our best to find the parameters in an invalid MIME header

"""
while value and value[0] != ';':
if value[0] in PHRASE_ENDS:
tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
value = value[1:]
else:
token, value = get_phrase(value)
tokenlist.append(token)
if not value:
return
tokenlist.append(ValueTerminal(';', 'parameter-separator'))
tokenlist.append(parse_mime_parameters(value[1:]))
value = _find_phrase(tokenlist, value,
PHRASE_ENDS, PHRASE_ENDS_CHARS_NO_SEMICOLON, ';')
if value:
tokenlist.append(ValueTerminal(';', 'parameter-separator'))
tokenlist.append(parse_mime_parameters(value[1:]))

def parse_content_type_header(value):
""" maintype "/" subtype *( ";" parameter )
Expand Down Expand Up @@ -2757,12 +2761,16 @@ def parse_content_transfer_encoding_header(value):
if not value:
return cte_header
while value:
cte_header.defects.append(errors.InvalidHeaderDefect(
"Extra text after content transfer encoding"))
if value[0] in PHRASE_ENDS:
cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
value = value[1:]
tmpvalue = value.lstrip(PHRASE_ENDS_CHARS)
for i in range(len(value) - len(tmpvalue)):
cte_header.defects.append(errors.InvalidHeaderDefect(
"Extra text after content transfer encoding"))
cte_header.append(ValueTerminal(value[i], 'misplaced-special'))
value = tmpvalue
else:
cte_header.defects.append(errors.InvalidHeaderDefect(
"Extra text after content transfer encoding"))
token, value = get_phrase(value)
cte_header.append(token)
return cte_header
Expand Down
10 changes: 10 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2676,6 +2676,16 @@ def test_invalid_content_transfer_encoding(self):
";foo", ";foo", ";foo", [errors.InvalidHeaderDefect]*3
)

def test_invalid_content_transfer_encoding_misplaced_special(self):
    # A run of ';' after the encoding token must yield one
    # 'misplaced-special' terminal per character.
    cte = parser.parse_content_transfer_encoding_header("foo;;;;;")
    self.assertEqual(len(cte), 6)
    self.assertEqual(cte[0].value, "foo")
    self.assertEqual(cte[0].token_type, "token")
    terminal = parser.ValueTerminal(";", "misplaced-special")
    self.assertEqual(cte[1:], [terminal] * 5)

# get_msg_id

def test_get_msg_id_empty(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix various email header value parsing routines with worst-case
quadratic time complexity. Patch by Bénédikt Tran.
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy