Skip to content

Commit 16e6f7d

Browse files
authored
bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201)
1 parent 1f58f4f commit 16e6f7d

File tree

4 files changed

+61
-0
lines changed

4 files changed

+61
-0
lines changed

Doc/library/urllib.parse.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,11 @@ or on combining URL components into a URL string.
124124
Unmatched square brackets in the :attr:`netloc` attribute will raise a
125125
:exc:`ValueError`.
126126

127+
Characters in the :attr:`netloc` attribute that decompose under NFKC
128+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
129+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
130+
decomposed before parsing, no error will be raised.
131+
127132
.. versionchanged:: 3.2
128133
Added IPv6 URL parsing capabilities.
129134

@@ -136,6 +141,10 @@ or on combining URL components into a URL string.
136141
Out-of-range port numbers now raise :exc:`ValueError`, instead of
137142
returning :const:`None`.
138143

144+
.. versionchanged:: 3.8
145+
Characters that affect netloc parsing under NFKC normalization will
146+
now raise :exc:`ValueError`.
147+
139148

140149
.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None)
141150

@@ -259,10 +268,19 @@ or on combining URL components into a URL string.
259268
Unmatched square brackets in the :attr:`netloc` attribute will raise a
260269
:exc:`ValueError`.
261270

271+
Characters in the :attr:`netloc` attribute that decompose under NFKC
272+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
273+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
274+
decomposed before parsing, no error will be raised.
275+
262276
.. versionchanged:: 3.6
263277
Out-of-range port numbers now raise :exc:`ValueError`, instead of
264278
returning :const:`None`.
265279

280+
.. versionchanged:: 3.8
281+
Characters that affect netloc parsing under NFKC normalization will
282+
now raise :exc:`ValueError`.
283+
266284

267285
.. function:: urlunsplit(parts)
268286

Lib/test/test_urlparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
import unicodedata
13
import unittest
24
import urllib.parse
35

@@ -994,6 +996,27 @@ def test_all(self):
994996
expected.append(name)
995997
self.assertCountEqual(urllib.parse.__all__, expected)
996998

999+
def test_urlsplit_normalization(self):
    """Check that urlsplit() rejects netlocs that normalize into delimiters.

    Every non-ASCII character whose Unicode decomposition contains one of
    the URL delimiters '/', ':', '#', '?' or '@' is placed inside a netloc,
    and urlsplit() is expected to raise ValueError for each of them.
    """
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = '/:#?@'
    # unicodedata.decomposition() reports code points as 4-digit upper-case
    # hex strings, so compare against the delimiters in that same form.
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    denorm_chars = [
        # sys.maxunicode is itself a valid code point, hence the "+ 1";
        # without it the last code point would be silently skipped.
        c for c in map(chr, range(128, sys.maxunicode + 1))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
        and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn('\u2100', denorm_chars)
    self.assertIn('\uFF03', denorm_chars)

    for scheme in ["http", "https", "ftp"]:
        for c in denorm_chars:
            url = "{}://netloc{}false.netloc/path".format(scheme, c)
            with self.subTest(url=url, char='{:04X}'.format(ord(c))):
                with self.assertRaises(ValueError):
                    urllib.parse.urlsplit(url)
9971020

9981021
class Utility_Tests(unittest.TestCase):
9991022
"""Testcase to test the various utility functions in the urllib."""

Lib/urllib/parse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,21 @@ def _splitnetloc(url, start=0):
396396
delim = min(delim, wdelim) # use earliest delim position
397397
return url[start:delim], url[delim:] # return (domain, rest)
398398

399+
def _checknetloc(netloc):
400+
if not netloc or netloc.isascii():
401+
return
402+
# looking for characters like \u2100 that expand to 'a/c'
403+
# IDNA uses NFKC equivalence, so normalize for this check
404+
import unicodedata
405+
netloc2 = unicodedata.normalize('NFKC', netloc)
406+
if netloc == netloc2:
407+
return
408+
_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
409+
for c in '/?#@:':
410+
if c in netloc2:
411+
raise ValueError("netloc '" + netloc2 + "' contains invalid " +
412+
"characters under NFKC normalization")
413+
399414
def urlsplit(url, scheme='', allow_fragments=True):
400415
"""Parse a URL into 5 components:
401416
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -424,6 +439,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
424439
url, fragment = url.split('#', 1)
425440
if '?' in url:
426441
url, query = url.split('?', 1)
442+
_checknetloc(netloc)
427443
v = SplitResult('http', netloc, url, query, fragment)
428444
_parse_cache[key] = v
429445
return _coerce_result(v)
@@ -447,6 +463,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
447463
url, fragment = url.split('#', 1)
448464
if '?' in url:
449465
url, query = url.split('?', 1)
466+
_checknetloc(netloc)
450467
v = SplitResult(scheme, netloc, url, query, fragment)
451468
_parse_cache[key] = v
452469
return _coerce_result(v)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Changes urlsplit() to raise ValueError when the URL contains characters that
2+
decompose under IDNA encoding (NFKC-normalization) into characters that
3+
affect how the URL is parsed.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy