From d08b50fbcd89ddea42b60388f1a297ad2a716853 Mon Sep 17 00:00:00 2001 From: Steve Dower Date: Thu, 7 Mar 2019 10:33:14 -0800 Subject: [PATCH] bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201) --- Doc/library/urllib.parse.rst | 18 +++++++++++++++ Lib/test/test_urlparse.py | 23 +++++++++++++++++++ Lib/urllib/parse.py | 17 ++++++++++++++ .../2019-03-06-09-38-40.bpo-36216.6q1m4a.rst | 3 +++ 4 files changed, 61 insertions(+) create mode 100644 Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index ac04f99deb74b7..ddbb9143b61192 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -118,6 +118,11 @@ or on combining URL components into a URL string. See section :ref:`urlparse-result-object` for more information on the result object. + Characters in the :attr:`netloc` attribute that decompose under NFKC + normalization (as used by the IDNA encoding) into any of ``/``, ``?``, + ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is + decomposed before parsing, no error will be raised. + .. versionchanged:: 3.2 Added IPv6 URL parsing capabilities. @@ -126,6 +131,10 @@ or on combining URL components into a URL string. false), in accordance with :rfc:`3986`. Previously, a whitelist of schemes that support fragments existed. + .. versionchanged:: 3.4.10 + Characters that affect netloc parsing under NFKC normalization will + now raise :exc:`ValueError`. + .. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace') @@ -231,6 +240,15 @@ or on combining URL components into a URL string. See section :ref:`urlparse-result-object` for more information on the result object. + Characters in the :attr:`netloc` attribute that decompose under NFKC + normalization (as used by the IDNA encoding) into any of ``/``, ``?``, + ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is + decomposed before parsing, no error will be raised. + + .. versionchanged:: 3.4.10 + Characters that affect netloc parsing under NFKC normalization will + now raise :exc:`ValueError`. + .. function:: urlunsplit(parts) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index f087cedbe00c59..77c200729a2c53 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1,3 +1,5 @@ +import sys +import unicodedata import unittest import urllib.parse @@ -868,6 +870,27 @@ def test_Quoter_repr(self): quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE) self.assertIn('Quoter', repr(quoter)) + def test_urlsplit_normalization(self): + # Certain characters should never occur in the netloc, + # including under normalization. + # Ensure that ALL of them are detected and cause an error + illegal_chars = '/:#?@' + hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars} + denorm_chars = [ + c for c in map(chr, range(128, sys.maxunicode)) + if (hex_chars & set(unicodedata.decomposition(c).split())) + and c not in illegal_chars + ] + # Sanity check that we found at least one such character + self.assertIn('\u2100', denorm_chars) + self.assertIn('\uFF03', denorm_chars) + + for scheme in ["http", "https", "ftp"]: + for c in denorm_chars: + url = "{}://netloc{}false.netloc/path".format(scheme, c) + with self.subTest(url=url, char='{:04X}'.format(ord(c))): + with self.assertRaises(ValueError): + urllib.parse.urlsplit(url) class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in the urllib.""" diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index d7b4e8846328ce..243f470a289ce9 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -316,6 +316,21 @@ def _splitnetloc(url, start=0): delim = min(delim, wdelim) # use earliest delim position return url[start:delim], url[delim:] # return (domain, rest) +def _checknetloc(netloc): + if not netloc or not any(ord(c) > 127 for c in netloc): + return + # looking for characters like \u2100 that expand to 'a/c' + # IDNA uses NFKC equivalence, so normalize for this check + import unicodedata + netloc2 = unicodedata.normalize('NFKC', netloc) + if netloc == netloc2: + return + _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay + for c in '/?#@:': + if c in netloc2: + raise ValueError("netloc '" + netloc2 + "' contains invalid " + + "characters under NFKC normalization") + def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: :///?# @@ -345,6 +360,7 @@ def urlsplit(url, scheme='', allow_fragments=True): url, fragment = url.split('#', 1) if '?' in url: url, query = url.split('?', 1) + _checknetloc(netloc) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v return _coerce_result(v) @@ -368,6 +384,7 @@ def urlsplit(url, scheme='', allow_fragments=True): url, fragment = url.split('#', 1) if '?' in url: url, query = url.split('?', 1) + _checknetloc(netloc) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v return _coerce_result(v) diff --git a/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst new file mode 100644 index 00000000000000..5546394157f9e5 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst @@ -0,0 +1,3 @@ +Changes urlsplit() to raise ValueError when the URL contains characters that +decompose under IDNA encoding (NFKC-normalization) into characters that +affect how the URL is parsed. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy