Skip to content

Commit daad2c4

Browse files
authored
bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201)
1 parent bf44f48 commit daad2c4

File tree

4 files changed

+61
-0
lines changed

4 files changed

+61
-0
lines changed

Doc/library/urllib.parse.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,11 @@ or on combining URL components into a URL string.
124124
Unmatched square brackets in the :attr:`netloc` attribute will raise a
125125
:exc:`ValueError`.
126126

127+
Characters in the :attr:`netloc` attribute that decompose under NFKC
128+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
129+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
130+
decomposed before parsing, no error will be raised.
131+
127132
.. versionchanged:: 3.2
128133
Added IPv6 URL parsing capabilities.
129134

@@ -136,6 +141,10 @@ or on combining URL components into a URL string.
136141
Out-of-range port numbers now raise :exc:`ValueError`, instead of
137142
returning :const:`None`.
138143

144+
.. versionchanged:: 3.7.3
145+
Characters that affect netloc parsing under NFKC normalization will
146+
now raise :exc:`ValueError`.
147+
139148

140149
.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None)
141150

@@ -257,10 +266,19 @@ or on combining URL components into a URL string.
257266
Unmatched square brackets in the :attr:`netloc` attribute will raise a
258267
:exc:`ValueError`.
259268

269+
Characters in the :attr:`netloc` attribute that decompose under NFKC
270+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
271+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
272+
decomposed before parsing, no error will be raised.
273+
260274
.. versionchanged:: 3.6
261275
Out-of-range port numbers now raise :exc:`ValueError`, instead of
262276
returning :const:`None`.
263277

278+
.. versionchanged:: 3.7.3
279+
Characters that affect netloc parsing under NFKC normalization will
280+
now raise :exc:`ValueError`.
281+
264282

265283
.. function:: urlunsplit(parts)
266284

Lib/test/test_urlparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
import unicodedata
13
import unittest
24
import urllib.parse
35

@@ -984,6 +986,27 @@ def test_all(self):
984986
expected.append(name)
985987
self.assertCountEqual(urllib.parse.__all__, expected)
986988

989+
def test_urlsplit_normalization(self):
    """urlsplit() must reject netloc characters that decompose to separators."""
    # These separators must never appear in a netloc, not even as the
    # result of Unicode decomposition of some other character.
    illegal_chars = '/:#?@'
    separator_codes = {'{:04X}'.format(ord(sep)) for sep in illegal_chars}

    # Collect every non-ASCII character whose decomposition mapping
    # mentions at least one of the separator code points.
    denorm_chars = []
    for codepoint in range(128, sys.maxunicode):
        candidate = chr(codepoint)
        mapping = set(unicodedata.decomposition(candidate).split())
        if (separator_codes & mapping) and candidate not in illegal_chars:
            denorm_chars.append(candidate)

    # Sanity check that the scan found at least the well-known offenders.
    self.assertIn('\u2100', denorm_chars)
    self.assertIn('\uFF03', denorm_chars)

    for scheme in ["http", "https", "ftp"]:
        for candidate in denorm_chars:
            url = "{}://netloc{}false.netloc/path".format(scheme, candidate)
            code = '{:04X}'.format(ord(candidate))
            with self.subTest(url=url, char=code), \
                    self.assertRaises(ValueError):
                urllib.parse.urlsplit(url)
9871010

9881011
class Utility_Tests(unittest.TestCase):
9891012
"""Testcase to test the various utility functions in the urllib."""

Lib/urllib/parse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
391391
delim = min(delim, wdelim) # use earliest delim position
392392
return url[start:delim], url[delim:] # return (domain, rest)
393393

394+
def _checknetloc(netloc):
395+
if not netloc or netloc.isascii():
396+
return
397+
# looking for characters like \u2100 that expand to 'a/c'
398+
# IDNA uses NFKC equivalence, so normalize for this check
399+
import unicodedata
400+
netloc2 = unicodedata.normalize('NFKC', netloc)
401+
if netloc == netloc2:
402+
return
403+
_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
404+
for c in '/?#@:':
405+
if c in netloc2:
406+
raise ValueError("netloc '" + netloc2 + "' contains invalid " +
407+
"characters under NFKC normalization")
408+
394409
def urlsplit(url, scheme='', allow_fragments=True):
395410
"""Parse a URL into 5 components:
396411
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -419,6 +434,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
419434
url, fragment = url.split('#', 1)
420435
if '?' in url:
421436
url, query = url.split('?', 1)
437+
_checknetloc(netloc)
422438
v = SplitResult('http', netloc, url, query, fragment)
423439
_parse_cache[key] = v
424440
return _coerce_result(v)
@@ -442,6 +458,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
442458
url, fragment = url.split('#', 1)
443459
if '?' in url:
444460
url, query = url.split('?', 1)
461+
_checknetloc(netloc)
445462
v = SplitResult(scheme, netloc, url, query, fragment)
446463
_parse_cache[key] = v
447464
return _coerce_result(v)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Changes urlsplit() to raise ValueError when the URL contains characters that
2+
decompose under IDNA encoding (NFKC-normalization) into characters that
3+
affect how the URL is parsed.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy