diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 81a35cdc801d09..94326876292b63 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); and where the hash values are equal (i.e. a very probable match) */ PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); +PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *); + #ifdef __cplusplus } #endif diff --git a/Include/errcode.h b/Include/errcode.h index b37cd261d5ec4d..790518b8b7730e 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -29,7 +29,6 @@ extern "C" { #define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */ -#define E_IDENTIFIER 26 /* Invalid characters in identifier */ #define E_BADSINGLE 27 /* Ill-formed single statement input */ #ifdef __cplusplus diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index ac5aa9a76efe7c..e0bb5b56b2614f 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -583,7 +583,7 @@ def test_missing_expression(self): ]) # Different error message is raised for other whitespace characters. - self.assertAllRaise(SyntaxError, 'invalid character in identifier', + self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0", ["f'''{\xa0}'''", "\xa0", ]) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a0bd741c36ac29..5ca43461d9940d 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -57,6 +57,9 @@ def test_issue7820(self): # one byte in common with the UTF-16-LE BOM self.assertRaises(SyntaxError, eval, b'\xff\x20') + # one byte in common with the UTF-8 BOM + self.assertRaises(SyntaxError, eval, b'\xef\x20') + # two bytes in common with the UTF-8 BOM self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py index 07332c4631903e..5b9ced5d1cb837 100644 --- a/Lib/test/test_unicode_identifiers.py +++ b/Lib/test/test_unicode_identifiers.py @@ -20,9 +20,11 @@ def test_non_bmp_normalized(self): def test_invalid(self): try: from test import badsyntax_3131 - except SyntaxError as s: - self.assertEqual(str(s), - "invalid character in identifier (badsyntax_3131.py, line 2)") + except SyntaxError as err: + self.assertEqual(str(err), + "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)") + self.assertEqual(err.lineno, 2) + self.assertEqual(err.offset, 1) else: self.fail("expected exception didn't occur") diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst new file mode 100644 index 00000000000000..5587d4f49ccf97 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst @@ -0,0 +1 @@ +Improved syntax errors for invalid characters in source code. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18b9458721de18..276547ca48a5b2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self) Py_RETURN_TRUE; } -int -PyUnicode_IsIdentifier(PyObject *self) +Py_ssize_t +_PyUnicode_ScanIdentifier(PyObject *self) { Py_ssize_t i; - int ready = PyUnicode_IS_READY(self); + if (PyUnicode_READY(self) == -1) + return -1; - Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); if (len == 0) { /* an empty string is not a valid identifier */ return 0; } - int kind = 0; - const void *data = NULL; - const wchar_t *wstr = NULL; - Py_UCS4 ch; - if (ready) { - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); - ch = PyUnicode_READ(kind, data, 0); - } - else { - wstr = _PyUnicode_WSTR(self); - ch = wstr[0]; - } + int kind = PyUnicode_KIND(self); + const void *data = PyUnicode_DATA(self); + Py_UCS4 ch = PyUnicode_READ(kind, data, 0); /* PEP 3131 says that the first character must be in XID_Start and subsequent characters in XID_Continue, and for the ASCII range, the 2.x rules apply (i.e @@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self) } for (i = 1; i < len; i++) { - if (ready) { - ch = PyUnicode_READ(kind, data, i); + ch = PyUnicode_READ(kind, data, i); + if (!_PyUnicode_IsXidContinue(ch)) { + return i; } - else { - ch = wstr[i]; + } + return i; +} + +int +PyUnicode_IsIdentifier(PyObject *self) +{ + if (PyUnicode_IS_READY(self)) { + Py_ssize_t i = _PyUnicode_ScanIdentifier(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + /* an empty string is not a valid identifier */ + return len && i == len; + } + else { + Py_ssize_t i, len = PyUnicode_GET_SIZE(self); + if (len == 0) { + /* an empty string is not a valid identifier */ + return 0; } - if (!_PyUnicode_IsXidContinue(ch)) { + + const wchar_t *wstr = _PyUnicode_WSTR(self); + Py_UCS4 ch = wstr[0]; + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { return 0; } + + for (i = 1; i < len; i++) { + ch = wstr[i]; + if (!_PyUnicode_IsXidContinue(ch)) { + return 0; + } + } + return 1; } - return 1; } /*[clinic input] diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index c80f08668b07d6..5f8c862c1f88be 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -337,9 +337,6 @@ tokenizer_error(Parser *p) case E_TOKEN: msg = "invalid token"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_EOFS: RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); return -1; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0f2b6af5e50adf..b81fa118f216eb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1101,25 +1101,53 @@ static int verify_identifier(struct tok_state *tok) { PyObject *s; - int result; if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); if (s == NULL) { if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { + tok->done = E_DECODE; + } + else { tok->done = E_ERROR; } return 0; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) { - tok->done = E_IDENTIFIER; + Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); + if (invalid < 0) { + Py_DECREF(s); + tok->done = E_ERROR; + return 0; } - return result; + assert(PyUnicode_GET_LENGTH(s) > 0); + if (invalid < PyUnicode_GET_LENGTH(s)) { + Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { + /* Determine the offset in UTF-8 encoded input */ + Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); + if (s != NULL) { + Py_SETREF(s, PyUnicode_AsUTF8String(s)); + } + if (s == NULL) { + tok->done = E_ERROR; + return 0; + } + tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); + } + Py_DECREF(s); + // PyUnicode_FromFormatV() does not support %X + char hex[9]; + snprintf(hex, sizeof(hex), "%04X", ch); + if (Py_UNICODE_ISPRINTABLE(ch)) { + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + } + else { + syntaxerror(tok, "invalid non-printable character U+%s", hex); + } + return 0; + } + Py_DECREF(s); + return 1; } static int diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 1b79a33c814da1..45f08b707eb999 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -1603,9 +1603,6 @@ err_input(perrdetail *err) msg = "unexpected character after line continuation character"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_BADSINGLE: msg = "multiple statements found while compiling a single statement"; break;
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: