Skip to content

Commit 23fc041

Browse files
zooba authored and ned-deily committed
[3.6] bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201) (GH-12215)
1 parent 5565b1d commit 23fc041

File tree

4 files changed

+61
-0
lines changed

4 files changed

+61
-0
lines changed

Doc/library/urllib.parse.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,11 @@ or on combining URL components into a URL string.
121121
Unmatched square brackets in the :attr:`netloc` attribute will raise a
122122
:exc:`ValueError`.
123123

124+
Characters in the :attr:`netloc` attribute that decompose under NFKC
125+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
126+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
127+
decomposed before parsing, no error will be raised.
128+
124129
.. versionchanged:: 3.2
125130
Added IPv6 URL parsing capabilities.
126131

@@ -133,6 +138,10 @@ or on combining URL components into a URL string.
133138
Out-of-range port numbers now raise :exc:`ValueError`, instead of
134139
returning :const:`None`.
135140

141+
.. versionchanged:: 3.6.9
142+
Characters that affect netloc parsing under NFKC normalization will
143+
now raise :exc:`ValueError`.
144+
136145

137146
.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None)
138147

@@ -256,10 +265,19 @@ or on combining URL components into a URL string.
256265
Unmatched square brackets in the :attr:`netloc` attribute will raise a
257266
:exc:`ValueError`.
258267

268+
Characters in the :attr:`netloc` attribute that decompose under NFKC
269+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
270+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
271+
decomposed before parsing, no error will be raised.
272+
259273
.. versionchanged:: 3.6
260274
Out-of-range port numbers now raise :exc:`ValueError`, instead of
261275
returning :const:`None`.
262276

277+
.. versionchanged:: 3.6.9
278+
Characters that affect netloc parsing under NFKC normalization will
279+
now raise :exc:`ValueError`.
280+
263281

264282
.. function:: urlunsplit(parts)
265283

Lib/test/test_urlparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
import unicodedata
13
import unittest
24
import urllib.parse
35

@@ -984,6 +986,27 @@ def test_all(self):
984986
expected.append(name)
985987
self.assertCountEqual(urllib.parse.__all__, expected)
986988

989+
def test_urlsplit_normalization(self):
990+
# Certain characters should never occur in the netloc,
991+
# including under normalization.
992+
# Ensure that ALL of them are detected and cause an error
993+
illegal_chars = '/:#?@'
994+
hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
995+
denorm_chars = [
996+
c for c in map(chr, range(128, sys.maxunicode))
997+
if (hex_chars & set(unicodedata.decomposition(c).split()))
998+
and c not in illegal_chars
999+
]
1000+
# Sanity check that we found at least one such character
1001+
self.assertIn('\u2100', denorm_chars)
1002+
self.assertIn('\uFF03', denorm_chars)
1003+
1004+
for scheme in ["http", "https", "ftp"]:
1005+
for c in denorm_chars:
1006+
url = "{}://netloc{}false.netloc/path".format(scheme, c)
1007+
with self.subTest(url=url, char='{:04X}'.format(ord(c))):
1008+
with self.assertRaises(ValueError):
1009+
urllib.parse.urlsplit(url)
9871010

9881011
class Utility_Tests(unittest.TestCase):
9891012
"""Testcase to test the various utility functions in the urllib."""

Lib/urllib/parse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
391391
delim = min(delim, wdelim) # use earliest delim position
392392
return url[start:delim], url[delim:] # return (domain, rest)
393393

394+
def _checknetloc(netloc):
395+
if not netloc or not any(ord(c) > 127 for c in netloc):
396+
return
397+
# looking for characters like \u2100 that expand to 'a/c'
398+
# IDNA uses NFKC equivalence, so normalize for this check
399+
import unicodedata
400+
netloc2 = unicodedata.normalize('NFKC', netloc)
401+
if netloc == netloc2:
402+
return
403+
_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
404+
for c in '/?#@:':
405+
if c in netloc2:
406+
raise ValueError("netloc '" + netloc2 + "' contains invalid " +
407+
"characters under NFKC normalization")
408+
394409
def urlsplit(url, scheme='', allow_fragments=True):
395410
"""Parse a URL into 5 components:
396411
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -420,6 +435,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
420435
url, fragment = url.split('#', 1)
421436
if '?' in url:
422437
url, query = url.split('?', 1)
438+
_checknetloc(netloc)
423439
v = SplitResult(scheme, netloc, url, query, fragment)
424440
_parse_cache[key] = v
425441
return _coerce_result(v)
@@ -443,6 +459,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
443459
url, fragment = url.split('#', 1)
444460
if '?' in url:
445461
url, query = url.split('?', 1)
462+
_checknetloc(netloc)
446463
v = SplitResult(scheme, netloc, url, query, fragment)
447464
_parse_cache[key] = v
448465
return _coerce_result(v)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Changes urlsplit() to raise ValueError when the URL contains characters that
2+
decompose under IDNA encoding (NFKC-normalization) into characters that
3+
affect how the URL is parsed.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy