From 91cdfbeb11e8222710af659cbf590f46ee0dd7f6 Mon Sep 17 00:00:00 2001
From: Steve Dower
Date: Wed, 20 Feb 2019 14:21:04 -0800
Subject: [PATCH 1/2] bpo-36216: Add check for characters in netloc that
 normalize to separators

---
 Doc/library/urllib.parse.rst                  | 18 +++++++++++++++
 Lib/test/test_urlparse.py                     | 23 +++++++++++++++++++
 Lib/urllib/parse.py                           | 17 ++++++++++++++
 .../2019-03-06-09-38-40.bpo-36216.6q1m4a.rst  |  3 +++
 4 files changed, 61 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst

diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
index 913e933d657cfe..af15f5bbfff3a2 100644
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -124,6 +124,11 @@ or on combining URL components into a URL string.
    Unmatched square brackets in the :attr:`netloc` attribute will raise a
    :exc:`ValueError`.
 
+   Characters in the :attr:`netloc` attribute that decompose under NFKC
+   normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
+   ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
+   decomposed before parsing, no error will be raised.
+
    .. versionchanged:: 3.2
       Added IPv6 URL parsing capabilities.
 
@@ -136,6 +141,10 @@ or on combining URL components into a URL string.
       Out-of-range port numbers now raise :exc:`ValueError`, instead of
       returning :const:`None`.
 
+   .. versionchanged:: 3.8
+      Characters that affect netloc parsing under NFKC normalization will
+      now raise :exc:`ValueError`.
+
 
 .. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None)
 
@@ -259,10 +268,19 @@ or on combining URL components into a URL string.
    Unmatched square brackets in the :attr:`netloc` attribute will raise a
    :exc:`ValueError`.
 
+   Characters in the :attr:`netloc` attribute that decompose under NFKC
+   normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
+   ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
+   decomposed before parsing, no error will be raised.
+
    .. versionchanged:: 3.6
       Out-of-range port numbers now raise :exc:`ValueError`, instead of
       returning :const:`None`.
 
+   .. versionchanged:: 3.8
+      Characters that affect netloc parsing under NFKC normalization will
+      now raise :exc:`ValueError`.
+
 
 .. function:: urlunsplit(parts)
 
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index 9c71be53afd42b..0faf2bbb645924 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -1,3 +1,5 @@
+import sys
+import unicodedata
 import unittest
 import urllib.parse
 
@@ -994,6 +996,27 @@ def test_all(self):
                 expected.append(name)
         self.assertCountEqual(urllib.parse.__all__, expected)
 
+    def test_urlsplit_normalization(self):
+        # Certain characters should never occur in the netloc,
+        # including under normalization.
+        # Ensure that ALL of them are detected and cause an error
+        illegal_chars = '/:#?@'
+        hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
+        denorm_chars = [
+            c for c in map(chr, range(128, sys.maxunicode))
+            if (hex_chars & set(unicodedata.decomposition(c).split()))
+            and c not in illegal_chars
+        ]
+        # Sanity check that we found at least one such character
+        self.assertIn('\u2100', denorm_chars)
+        self.assertIn('\uFF03', denorm_chars)
+
+        for scheme in ["http", "https", "ftp"]:
+            for c in denorm_chars:
+                url = "{}://netloc{}false.netloc/path".format(scheme, c)
+                with self.subTest(url=url, char='{:04X}'.format(ord(c))):
+                    with self.assertRaises(ValueError):
+                        urllib.parse.urlsplit(url)
+
 class Utility_Tests(unittest.TestCase):
     """Testcase to test the various utility functions in the urllib."""
 
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index dc2171144fc8ba..97a44d6da784a6 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -30,6 +30,7 @@
 import re
 import sys
 import collections
+import unicodedata
 import warnings
 
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
@@ -396,6 +397,20 @@ def _splitnetloc(url, start=0):
         delim = min(delim, wdelim)     # use earliest delim position
     return url[start:delim], url[delim:]   # return (domain, rest)
 
+def _checknetloc(netloc):
+    if not netloc or netloc.isascii():
+        return
+    # looking for characters like \u2100 that expand to 'a/c'
+    # IDNA uses NFKC equivalence, so normalize for this check
+    netloc2 = unicodedata.normalize('NFKC', netloc)
+    if netloc == netloc2:
+        return
+    _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
+    for c in '/?#@:':
+        if c in netloc2:
+            raise ValueError("netloc '" + netloc2 + "' contains invalid " +
+                             "characters under NFKC normalization")
+
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -424,6 +439,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
                 url, fragment = url.split('#', 1)
             if '?' in url:
                 url, query = url.split('?', 1)
+            _checknetloc(netloc)
             v = SplitResult('http', netloc, url, query, fragment)
             _parse_cache[key] = v
             return _coerce_result(v)
@@ -447,6 +463,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
         url, fragment = url.split('#', 1)
     if '?' in url:
         url, query = url.split('?', 1)
+    _checknetloc(netloc)
     v = SplitResult(scheme, netloc, url, query, fragment)
     _parse_cache[key] = v
     return _coerce_result(v)
diff --git a/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
new file mode 100644
index 00000000000000..5546394157f9e5
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
@@ -0,0 +1,3 @@
+Changes urlsplit() to raise ValueError when the URL contains characters that
+decompose under IDNA encoding (NFKC-normalization) into characters that
+affect how the URL is parsed.
From f1d2699f334ec91a8e8663cf189be451b59cdf99 Mon Sep 17 00:00:00 2001
From: Steve Dower
Date: Wed, 6 Mar 2019 10:10:33 -0800
Subject: [PATCH 2/2] Load unicodedata module lazily

---
 Lib/urllib/parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 97a44d6da784a6..8b6c9b10609152 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -30,7 +30,6 @@
 import re
 import sys
 import collections
-import unicodedata
 import warnings
 
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
@@ -402,6 +401,7 @@ def _checknetloc(netloc):
         return
     # looking for characters like \u2100 that expand to 'a/c'
     # IDNA uses NFKC equivalence, so normalize for this check
+    import unicodedata
     netloc2 = unicodedata.normalize('NFKC', netloc)
     if netloc == netloc2:
         return
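For illustration only (not part of the patch series): a minimal sketch of the behaviour these changes introduce, assuming an interpreter that includes them. U+2100 (ACCOUNT OF) has the NFKC compatibility decomposition "a/c", so leaving it in the netloc would change how the URL is parsed once IDNA/NFKC normalization is applied; the patched urlsplit() rejects such a URL up front, while the same URL decomposed before parsing is still accepted.

import unicodedata
import urllib.parse

# U+2100 decomposes to 'a/c' under NFKC, so a netloc containing it would
# gain a '/' after normalization.
url = "https://netloc\u2100false.netloc/path"

try:
    urllib.parse.urlsplit(url)
except ValueError as exc:
    print("rejected:", exc)   # raised by the new _checknetloc() check

# Decomposing the URL before parsing avoids the error: the character becomes
# an explicit '/', which simply ends the netloc earlier.
print(urllib.parse.urlsplit(unicodedata.normalize("NFKC", url)))
# roughly: SplitResult(scheme='https', netloc='netloca', path='/cfalse.netloc/path', ...)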
