From 3566743096b98ae3612364d50f112b6bb174b72b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 13 Mar 2021 13:28:13 +0100 Subject: [PATCH] bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c. --- Lib/test/test_cmd_line.py | 86 +++++++++---- .../2021-03-13-13-57-21.bpo-35883.UyGpdG.rst | 4 + Objects/unicodeobject.c | 9 +- Python/fileutils.c | 118 ++++++++++++------ 4 files changed, 148 insertions(+), 69 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index f12dff3202fe31..5abefde9ac4091 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -200,38 +200,72 @@ def test_undecodable_code(self): if not stdout.startswith(pattern): raise AssertionError("%a doesn't start with %a" % (stdout, pattern)) + @unittest.skipIf(sys.platform == 'win32', + 'Windows has a native unicode API') + def test_invalid_utf8_arg(self): + # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba' + # byte sequence with surrogateescape rather than decoding it as the + # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of + # Python Unicode characters. + # + # Test with default config, in the C locale, in the Python UTF-8 Mode. + code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))' + base_cmd = [sys.executable, '-c', code] + + def run_default(arg): + cmd = [sys.executable, '-c', code, arg] + return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + + def run_c_locale(arg): + cmd = [sys.executable, '-c', code, arg] + env = dict(os.environ) + env['LC_ALL'] = 'C' + return subprocess.run(cmd, stdout=subprocess.PIPE, + text=True, env=env) + + def run_utf8_mode(arg): + cmd = [sys.executable, '-X', 'utf8', '-c', code, arg] + return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + + valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') + # invalid UTF-8 byte sequences with a valid UTF-8 sequence + # in the middle. + invalid_utf8 = ( + b'\xff' # invalid byte + b'\xc3\xff' # invalid byte sequence + b'\xc3\xa9' # valid utf-8: U+00E9 character + b'\xed\xa0\x80' # lone surrogate character (invalid) + b'\xfd\xbf\xbf\xbb\xba\xba' # character outside [U+0000; U+10ffff] + ) + test_args = [valid_utf8, invalid_utf8] + + for run_cmd in (run_default, run_c_locale, run_utf8_mode): + with self.subTest(run_cmd=run_cmd): + for arg in test_args: + proc = run_cmd(arg) + self.assertEqual(proc.stdout.rstrip(), ascii(arg)) + @unittest.skipUnless((sys.platform == 'darwin' or support.is_android), 'test specific to Mac OS X and Android') def test_osx_android_utf8(self): - def check_output(text): - decoded = text.decode('utf-8', 'surrogateescape') - expected = ascii(decoded).encode('ascii') + b'\n' + text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') + code = "import sys; print(ascii(sys.argv[1]))" - env = os.environ.copy() - # C locale gives ASCII locale encoding, but Python uses UTF-8 - # to parse the command line arguments on Mac OS X and Android. - env['LC_ALL'] = 'C' + decoded = text.decode('utf-8', 'surrogateescape') + expected = ascii(decoded).encode('ascii') + b'\n' - p = subprocess.Popen( - (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text), - stdout=subprocess.PIPE, - env=env) - stdout, stderr = p.communicate() - self.assertEqual(stdout, expected) - self.assertEqual(p.returncode, 0) + env = os.environ.copy() + # C locale gives ASCII locale encoding, but Python uses UTF-8 + # to parse the command line arguments on Mac OS X and Android. + env['LC_ALL'] = 'C' - # test valid utf-8 - text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') - check_output(text) - - # test invalid utf-8 - text = ( - b'\xff' # invalid byte - b'\xc3\xa9' # valid utf-8 character - b'\xc3\xff' # invalid byte sequence - b'\xed\xa0\x80' # lone surrogate character (invalid) - ) - check_output(text) + p = subprocess.Popen( + (sys.executable, "-c", code, text), + stdout=subprocess.PIPE, + env=env) + stdout, stderr = p.communicate() + self.assertEqual(stdout, expected) + self.assertEqual(p.returncode, 0) def test_non_interactive_output_buffering(self): code = textwrap.dedent(""" diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst new file mode 100644 index 00000000000000..46742429db64bf --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst @@ -0,0 +1,4 @@ +Python no longer fails at startup with a fatal error if a command line +argument contains an invalid Unicode character. The +:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be +decoded as Unicode characters outside the [U+0000; U+10ffff] range. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0b08b0e8f07409..a7a31515470999 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -94,7 +94,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently extern "C" { #endif -/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ +// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111). +// The value must be the same in fileutils.c. #define MAX_UNICODE 0x10ffff #ifdef Py_DEBUG @@ -1784,8 +1785,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, *maxchar = ch; if (*maxchar > MAX_UNICODE) { PyErr_Format(PyExc_ValueError, - "character U+%x is not in range [U+0000; U+10ffff]", - ch); + "character U+%x is not in range [U+0000; U+%x]", + ch, MAX_UNICODE); return -1; } } @@ -14089,7 +14090,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, { case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; - case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break; default: Py_UNREACHABLE(); } diff --git a/Python/fileutils.c b/Python/fileutils.c index f2b4681ea849c5..4997f922251c1e 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -34,6 +34,13 @@ extern int winerror_to_errno(int); int _Py_open_cloexec_works = -1; #endif +// The value must be the same in unicodeobject.c. +#define MAX_UNICODE 0x10ffff + +// mbstowcs() and mbrtowc() errors +static const size_t DECODE_ERROR = ((size_t)-1); +static const size_t INCOMPLETE_CHARACTER = (size_t)-2; + static int get_surrogateescape(_Py_error_handler errors, int *surrogateescape) @@ -82,6 +89,57 @@ _Py_device_encoding(int fd) #endif } + +static size_t +is_valid_wide_char(wchar_t ch) +{ + if (Py_UNICODE_IS_SURROGATE(ch)) { + // Reject lone surrogate characters + return 0; + } + if (ch > MAX_UNICODE) { + // bpo-35883: Reject characters outside [U+0000; U+10ffff] range. + // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629, + // it creates characters outside the [U+0000; U+10ffff] range: + // https://sourceware.org/bugzilla/show_bug.cgi?id=2373 + return 0; + } + return 1; +} + + +static size_t +_Py_mbstowcs(wchar_t *dest, const char *src, size_t n) +{ + size_t count = mbstowcs(dest, src, n); + if (dest != NULL && count != DECODE_ERROR) { + for (size_t i=0; i < count; i++) { + wchar_t ch = dest[i]; + if (!is_valid_wide_char(ch)) { + return DECODE_ERROR; + } + } + } + return count; +} + + +#ifdef HAVE_MBRTOWC +static size_t +_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs) +{ + assert(pwc != NULL); + size_t count = mbrtowc(pwc, str, len, pmbs); + if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) { + if (!is_valid_wide_char(*pwc)) { + return DECODE_ERROR; + } + } + return count; +} +#endif + + #if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS) #define USE_FORCE_ASCII @@ -148,8 +206,8 @@ check_force_ascii(void) size_t res; ch = (unsigned char)0xA7; - res = mbstowcs(&wch, (char*)&ch, 1); - if (res != (size_t)-1 && wch == L'\xA7') { + res = _Py_mbstowcs(&wch, (char*)&ch, 1); + if (res != DECODE_ERROR && wch == L'\xA7') { /* On HP-UX withe C locale or the POSIX locale, nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses Latin1 encoding in practice. Force ASCII in this case. @@ -196,8 +254,8 @@ check_force_ascii(void) unsigned uch = (unsigned char)i; ch[0] = (char)uch; - res = mbstowcs(wch, ch, 1); - if (res != (size_t)-1) { + res = _Py_mbstowcs(wch, ch, 1); + if (res != DECODE_ERROR) { /* decoding a non-ASCII character from the locale encoding succeed: the locale encoding is not ASCII, force ASCII */ return 1; @@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, */ argsize = strlen(arg); #else - argsize = mbstowcs(NULL, arg, 0); + argsize = _Py_mbstowcs(NULL, arg, 0); #endif - if (argsize != (size_t)-1) { + if (argsize != DECODE_ERROR) { if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { return -1; } @@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, return -1; } - count = mbstowcs(res, arg, argsize + 1); - if (count != (size_t)-1) { - wchar_t *tmp; - /* Only use the result if it contains no - surrogate characters. */ - for (tmp = res; *tmp != 0 && - !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) - ; - if (*tmp == 0) { - if (wlen != NULL) { - *wlen = count; - } - *wstr = res; - return 0; + count = _Py_mbstowcs(res, arg, argsize + 1); + if (count != DECODE_ERROR) { + *wstr = res; + if (wlen != NULL) { + *wlen = count; } + return 0; } PyMem_RawFree(res); } @@ -436,13 +486,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, out = res; memset(&mbs, 0, sizeof mbs); while (argsize) { - size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); + size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs); if (converted == 0) { /* Reached end of string; null char stored. */ break; } - if (converted == (size_t)-2) { + if (converted == INCOMPLETE_CHARACTER) { /* Incomplete character. This should never happen, since we provide everything that we have - unless there is a bug in the C library, or I @@ -450,32 +500,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, goto decode_error; } - if (converted == (size_t)-1) { + if (converted == DECODE_ERROR) { if (!surrogateescape) { goto decode_error; } - /* Conversion error. Escape as UTF-8b, and start over - in the initial shift state. */ + /* Decoding error. Escape as UTF-8b, and start over in the initial + shift state. */ *out++ = 0xdc00 + *in++; argsize--; memset(&mbs, 0, sizeof mbs); continue; } - if (Py_UNICODE_IS_SURROGATE(*out)) { - if (!surrogateescape) { - goto decode_error; - } + // _Py_mbrtowc() reject lone surrogate characters + assert(!Py_UNICODE_IS_SURROGATE(*out)); - /* Surrogate character. Escape the original - byte sequence with surrogateescape. */ - argsize -= converted; - while (converted--) { - *out++ = 0xdc00 + *in++; - } - continue; - } /* successfully converted some bytes */ in += converted; argsize -= converted; @@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str, else { converted = wcstombs(NULL, buf, 0); } - if (converted == (size_t)-1) { + if (converted == DECODE_ERROR) { goto encode_error; } if (bytes != NULL) { @@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode) char cmode[10]; size_t r; r = wcstombs(cmode, mode, 10); - if (r == (size_t)-1 || r >= 10) { + if (r == DECODE_ERROR || r >= 10) { errno = EINVAL; return NULL; } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy