Skip to content

Commit c0d9511

Browse files
zooba authored and larryhastings committed
bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201) (#12223)
1 parent 6b0d50d commit c0d9511

File tree

4 files changed

+61
-0
lines changed

4 files changed

+61
-0
lines changed

Doc/library/urllib.parse.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ or on combining URL components into a URL string.
120120
Unmatched square brackets in the :attr:`netloc` attribute will raise a
121121
:exc:`ValueError`.
122122

123+
Characters in the :attr:`netloc` attribute that decompose under NFKC
124+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
125+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
126+
decomposed before parsing, no error will be raised.
127+
123128
.. versionchanged:: 3.2
124129
Added IPv6 URL parsing capabilities.
125130

@@ -128,6 +133,10 @@ or on combining URL components into a URL string.
128133
false), in accordance with :rfc:`3986`. Previously, a whitelist of
129134
schemes that support fragments existed.
130135

136+
.. versionchanged:: 3.5.7
137+
Characters that affect netloc parsing under NFKC normalization will
138+
now raise :exc:`ValueError`.
139+
131140

132141
.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace')
133142

@@ -236,6 +245,15 @@ or on combining URL components into a URL string.
236245
Unmatched square brackets in the :attr:`netloc` attribute will raise a
237246
:exc:`ValueError`.
238247

248+
Characters in the :attr:`netloc` attribute that decompose under NFKC
249+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
250+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
251+
decomposed before parsing, no error will be raised.
252+
253+
.. versionchanged:: 3.5.7
254+
Characters that affect netloc parsing under NFKC normalization will
255+
now raise :exc:`ValueError`.
256+
239257

240258
.. function:: urlunsplit(parts)
241259

Lib/test/test_urlparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
import unicodedata
13
import unittest
24
import urllib.parse
35

@@ -970,6 +972,27 @@ def test_all(self):
970972
expected.append(name)
971973
self.assertCountEqual(urllib.parse.__all__, expected)
972974

975+
def test_urlsplit_normalization(self):
976+
# Certain characters should never occur in the netloc,
977+
# including under normalization.
978+
# Ensure that ALL of them are detected and cause an error
979+
illegal_chars = '/:#?@'
980+
hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
981+
denorm_chars = [
982+
c for c in map(chr, range(128, sys.maxunicode))
983+
if (hex_chars & set(unicodedata.decomposition(c).split()))
984+
and c not in illegal_chars
985+
]
986+
# Sanity check that we found at least one such character
987+
self.assertIn('\u2100', denorm_chars)
988+
self.assertIn('\uFF03', denorm_chars)
989+
990+
for scheme in ["http", "https", "ftp"]:
991+
for c in denorm_chars:
992+
url = "{}://netloc{}false.netloc/path".format(scheme, c)
993+
with self.subTest(url=url, char='{:04X}'.format(ord(c))):
994+
with self.assertRaises(ValueError):
995+
urllib.parse.urlsplit(url)
973996

974997
class Utility_Tests(unittest.TestCase):
975998
"""Testcase to test the various utility functions in the urllib."""

Lib/urllib/parse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,21 @@ def _splitnetloc(url, start=0):
327327
delim = min(delim, wdelim) # use earliest delim position
328328
return url[start:delim], url[delim:] # return (domain, rest)
329329

330+
def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    IDNA encoding uses NFKC equivalence, and NFKC can expand a single
    non-ASCII character into text containing '/', '?', '#', '@' or ':'
    (for example U+2100 expands to 'a/c').  A netloc like that would be
    split differently after normalization than before it, so it is
    refused.

    Delimiters that are already literally present in *netloc* (such as
    the '@' after user-info or the ':' before a port) are legitimate and
    are ignored; only delimiters introduced by normalization count.

    Returns None when *netloc* is acceptable.
    Raises ValueError when normalization introduces a delimiter.
    """
    if not netloc or not any(ord(c) > 127 for c in netloc):
        # Nothing to check: the input is empty or contains no non-ASCII
        # character that could expand into a delimiter.
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Remove the delimiters the caller wrote explicitly, keeping the
    # surrounding text, so that they are not mistaken for delimiters
    # produced by normalization.
    literal = netloc.translate(str.maketrans('', '', '@:#?'))
    normalized = unicodedata.normalize('NFKC', literal)
    if literal == normalized:
        return
    if any(c in normalized for c in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
344+
330345
def urlsplit(url, scheme='', allow_fragments=True):
331346
"""Parse a URL into 5 components:
332347
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -356,6 +371,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
356371
url, fragment = url.split('#', 1)
357372
if '?' in url:
358373
url, query = url.split('?', 1)
374+
_checknetloc(netloc)
359375
v = SplitResult(scheme, netloc, url, query, fragment)
360376
_parse_cache[key] = v
361377
return _coerce_result(v)
@@ -379,6 +395,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
379395
url, fragment = url.split('#', 1)
380396
if '?' in url:
381397
url, query = url.split('?', 1)
398+
_checknetloc(netloc)
382399
v = SplitResult(scheme, netloc, url, query, fragment)
383400
_parse_cache[key] = v
384401
return _coerce_result(v)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Changes urlsplit() to raise ValueError when the URL contains characters that
2+
decompose under IDNA encoding (NFKC-normalization) into characters that
3+
affect how the URL is parsed.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy