Skip to content

Commit 515a7bc

Browse files
miss-islington, gpshead, serhiy-storchaka, orsenthil
authored
[3.8] bpo-43882 - urllib.parse should sanitize urls containing ASCII newline and tabs. (GH-25595) (#25726)
Co-authored-by: Gregory P. Smith <greg@krypto.org> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> (cherry picked from commit 76cd81d) Co-authored-by: Senthil Kumaran <senthil@uthcode.com> Co-authored-by: Senthil Kumaran <skumaran@gatech.edu>
1 parent 44f6b9a commit 515a7bc

File tree

4 files changed

+77
-0
lines changed

4 files changed

+77
-0
lines changed

Doc/library/urllib.parse.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,9 @@ or on combining URL components into a URL string.
312312
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
313313
decomposed before parsing, no error will be raised.
314314

315+
Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline
316+
``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL.
317+
315318
.. versionchanged:: 3.6
316319
Out-of-range port numbers now raise :exc:`ValueError`, instead of
317320
returning :const:`None`.
@@ -320,6 +323,10 @@ or on combining URL components into a URL string.
320323
Characters that affect netloc parsing under NFKC normalization will
321324
now raise :exc:`ValueError`.
322325

326+
.. versionchanged:: 3.8.10
327+
ASCII newline and tab characters are stripped from the URL.
328+
329+
.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser
323330

324331
.. function:: urlunsplit(parts)
325332

@@ -668,6 +675,10 @@ task isn't already covered by the URL parsing functions above.
668675

669676
.. seealso::
670677

678+
`WHATWG`_ - URL Living standard
679+
Working Group for the URL Standard that defines URLs, domains, IP addresses, the
680+
application/x-www-form-urlencoded format, and their API.
681+
671682
:rfc:`3986` - Uniform Resource Identifiers
672683
This is the current standard (STD66). Any changes to urllib.parse module
673684
should conform to this. Certain deviations could be observed, which are
@@ -691,3 +702,5 @@ task isn't already covered by the URL parsing functions above.
691702

692703
:rfc:`1738` - Uniform Resource Locators (URL)
693704
This specifies the formal syntax and semantics of absolute URLs.
705+
706+
.. _WHATWG: https://url.spec.whatwg.org/

Lib/test/test_urlparse.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,54 @@ def test_urlsplit_attributes(self):
612612
with self.assertRaisesRegex(ValueError, "out of range"):
613613
p.port
614614

615+
# Check that urlsplit() strips ASCII tab, CR and LF characters from the URL
# and from the scheme argument, for both str and bytes input.
def test_urlsplit_remove_unsafe_bytes(self):
616+
# Remove ASCII tabs and newlines from str input, for the common http case.
617+
url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
618+
p = urllib.parse.urlsplit(url)
619+
self.assertEqual(p.scheme, "http")
620+
self.assertEqual(p.netloc, "www.python.org")
621+
self.assertEqual(p.path, "/javascript:alert('msg')/")
622+
self.assertEqual(p.query, "query=something")
623+
self.assertEqual(p.fragment, "fragment")
624+
self.assertEqual(p.username, None)
625+
self.assertEqual(p.password, None)
626+
self.assertEqual(p.hostname, "www.python.org")
627+
self.assertEqual(p.port, None)
628+
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
629+
630+
# Remove ASCII tabs and newlines from bytes input, for the common http case.
631+
url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
632+
p = urllib.parse.urlsplit(url)
633+
self.assertEqual(p.scheme, b"http")
634+
self.assertEqual(p.netloc, b"www.python.org")
635+
self.assertEqual(p.path, b"/javascript:alert('msg')/")
636+
self.assertEqual(p.query, b"query=something")
637+
self.assertEqual(p.fragment, b"fragment")
638+
self.assertEqual(p.username, None)
639+
self.assertEqual(p.password, None)
640+
self.assertEqual(p.hostname, b"www.python.org")
641+
self.assertEqual(p.port, None)
642+
self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment")
643+
644+
# Remove ASCII tabs and newlines from str input, any scheme.
645+
url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
646+
p = urllib.parse.urlsplit(url)
647+
self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
648+
649+
# Remove ASCII tabs and newlines from bytes input, any scheme.
650+
url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
651+
p = urllib.parse.urlsplit(url)
652+
self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
653+
654+
# Unsafe bytes are not returned from the urlsplit cache (the loop below parses the same input twice).
655+
# The scheme is sanitized before it is stored, so passing a scheme with unsafe bytes *will not* return an unsafe scheme.
656+
url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
657+
scheme = "htt\nps"
658+
for _ in range(2):
659+
p = urllib.parse.urlsplit(url, scheme=scheme)
660+
self.assertEqual(p.scheme, "https")
661+
self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")
662+
615663
def test_attributes_bad_port(self):
616664
"""Check handling of invalid ports."""
617665
for bytes in (False, True):

Lib/urllib/parse.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@
7777
'0123456789'
7878
'+-.')
7979

80+
# Unsafe characters to be removed from URLs per the WHATWG spec:
# ASCII tab, carriage return and newline.
81+
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
82+
8083
# XXX: Consider replacing with functools.lru_cache
8184
MAX_CACHE_SIZE = 20
8285
_parse_cache = {}
@@ -414,13 +417,20 @@ def _checknetloc(netloc):
414417
raise ValueError("netloc '" + netloc + "' contains invalid " +
415418
"characters under NFKC normalization")
416419

420+
# Return *url* with every occurrence of each entry in
# _UNSAFE_URL_BYTES_TO_REMOVE deleted.  The replacement is the str "",
# so *url* is expected to be a str at this point.
def _remove_unsafe_bytes_from_url(url):
421+
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
422+
url = url.replace(b, "")
423+
return url
424+
417425
def urlsplit(url, scheme='', allow_fragments=True):
418426
"""Parse a URL into 5 components:
419427
<scheme>://<netloc>/<path>?<query>#<fragment>
420428
Return a 5-tuple: (scheme, netloc, path, query, fragment).
421429
Note that we don't break the components up in smaller bits
422430
(e.g. netloc is a single string) and we don't expand % escapes."""
423431
url, scheme, _coerce_result = _coerce_args(url, scheme)
432+
url = _remove_unsafe_bytes_from_url(url)
433+
scheme = _remove_unsafe_bytes_from_url(scheme)
424434
allow_fragments = bool(allow_fragments)
425435
key = url, scheme, allow_fragments, type(url), type(scheme)
426436
cached = _parse_cache.get(key, None)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
The presence of newline or tab characters in parts of a URL could allow
2+
some forms of attacks.
3+
4+
Following the controlling specification for URLs defined by WHATWG,
5+
:mod:`urllib.parse` now removes ASCII newlines and tabs from URLs,
6+
preventing such attacks.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy