From 8c069633d4fc403475b6168a09e3f4e04a9c1bc6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 13:50:59 +0300 Subject: [PATCH 1/3] bpo-40593: Improve syntax errors for invalid characters in source code. --- Include/cpython/unicodeobject.h | 2 + Include/errcode.h | 1 - Lib/test/test_fstring.py | 2 +- Lib/test/test_source_encoding.py | 3 ++ Lib/test/test_unicode_identifiers.py | 8 ++-- .../2020-05-11-13-50-52.bpo-40593.yuOXj3.rst | 1 + Objects/unicodeobject.c | 37 +++++++++++++++++ Parser/pegen/pegen.c | 3 -- Parser/tokenizer.c | 41 +++++++++++++++---- Python/pythonrun.c | 3 -- 10 files changed, 81 insertions(+), 20 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 81a35cdc801d09..94326876292b63 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); and where the hash values are equal (i.e. a very probable match) */ PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); +PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *); + #ifdef __cplusplus } #endif diff --git a/Include/errcode.h b/Include/errcode.h index b37cd261d5ec4d..790518b8b7730e 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -29,7 +29,6 @@ extern "C" { #define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */ -#define E_IDENTIFIER 26 /* Invalid characters in identifier */ #define E_BADSINGLE 27 /* Ill-formed single statement input */ #ifdef __cplusplus diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index ac5aa9a76efe7c..393be624b6ae25 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -583,7 +583,7 @@ def test_missing_expression(self): ]) # Different error message is raised for other whitespace characters. - self.assertAllRaise(SyntaxError, 'invalid character in identifier', + self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)", ["f'''{\xa0}'''", "\xa0", ]) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a0bd741c36ac29..5ca43461d9940d 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -57,6 +57,9 @@ def test_issue7820(self): # one byte in common with the UTF-16-LE BOM self.assertRaises(SyntaxError, eval, b'\xff\x20') + # one byte in common with the UTF-8 BOM + self.assertRaises(SyntaxError, eval, b'\xef\x20') + # two bytes in common with the UTF-8 BOM self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py index 07332c4631903e..5b9ced5d1cb837 100644 --- a/Lib/test/test_unicode_identifiers.py +++ b/Lib/test/test_unicode_identifiers.py @@ -20,9 +20,11 @@ def test_non_bmp_normalized(self): def test_invalid(self): try: from test import badsyntax_3131 - except SyntaxError as s: - self.assertEqual(str(s), - "invalid character in identifier (badsyntax_3131.py, line 2)") + except SyntaxError as err: + self.assertEqual(str(err), + "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)") + self.assertEqual(err.lineno, 2) + self.assertEqual(err.offset, 1) else: self.fail("expected exception didn't occur") diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst new file mode 100644 index 00000000000000..5587d4f49ccf97 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst @@ -0,0 +1 @@ +Improved syntax errors for invalid characters in source code. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18b9458721de18..4c79bd18af2827 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12309,6 +12309,43 @@ unicode_isnumeric_impl(PyObject *self) Py_RETURN_TRUE; } +Py_ssize_t +_PyUnicode_ScanIdentifier(PyObject *self) +{ + Py_ssize_t i; + if (PyUnicode_READY(self) == -1) + return -1; + + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + if (len == 0) { + /* an empty string is not a valid identifier */ + return 0; + } + + int kind = PyUnicode_KIND(self); + const void *data = PyUnicode_DATA(self); + Py_UCS4 ch = PyUnicode_READ(kind, data, 0); + /* PEP 3131 says that the first character must be in + XID_Start and subsequent characters in XID_Continue, + and for the ASCII range, the 2.x rules apply (i.e + start with letters and underscore, continue with + letters, digits, underscore). However, given the current + definition of XID_Start and XID_Continue, it is sufficient + to check just for these, except that _ must be allowed + as starting an identifier. */ + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { + return 0; + } + + for (i = 1; i < len; i++) { + ch = PyUnicode_READ(kind, data, i); + if (!_PyUnicode_IsXidContinue(ch)) { + return i; + } + } + return i; +} + int PyUnicode_IsIdentifier(PyObject *self) { diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index c80f08668b07d6..5f8c862c1f88be 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -337,9 +337,6 @@ tokenizer_error(Parser *p) case E_TOKEN: msg = "invalid token"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_EOFS: RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); return -1; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0f2b6af5e50adf..6d6f930a0df633 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1101,25 +1101,48 @@ static int verify_identifier(struct tok_state *tok) { PyObject *s; - int result; if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); if (s == NULL) { if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { + tok->done = E_DECODE; + } + else { tok->done = E_ERROR; } return 0; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) { - tok->done = E_IDENTIFIER; + Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); + if (invalid < 0) { + Py_DECREF(s); + tok->done = E_ERROR; + return 0; } - return result; + assert(PyUnicode_GET_LENGTH(s) > 0); + if (invalid < PyUnicode_GET_LENGTH(s)) { + Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { + /* Determine the offset in UTF-8 encoded input */ + Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); + if (s != NULL) { + Py_SETREF(s, PyUnicode_AsUTF8String(s)); + } + if (s == NULL) { + tok->done = E_ERROR; + return 0; + } + tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); + } + Py_DECREF(s); + // PyUnicode_FromFormatV() does not support %X + char hex[9]; + snprintf(hex, sizeof(hex), "%04X", ch); + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + return 0; + } + Py_DECREF(s); + return 1; } static int diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 1b79a33c814da1..45f08b707eb999 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -1603,9 +1603,6 @@ err_input(perrdetail *err) msg = "unexpected character after line continuation character"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_BADSINGLE: msg = "multiple statements found while compiling a single statement"; break; From e543b404cac4ec0b5dac18da0d89a46d018c7924 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 15:03:16 +0300 Subject: [PATCH 2/3] Omit non-printable characters. --- Lib/test/test_fstring.py | 2 +- Parser/tokenizer.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 393be624b6ae25..e0bb5b56b2614f 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -583,7 +583,7 @@ def test_missing_expression(self): ]) # Different error message is raised for other whitespace characters. - self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)", + self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0", ["f'''{\xa0}'''", "\xa0", ]) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6d6f930a0df633..b81fa118f216eb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1138,7 +1138,12 @@ verify_identifier(struct tok_state *tok) // PyUnicode_FromFormatV() does not support %X char hex[9]; snprintf(hex, sizeof(hex), "%04X", ch); - syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + if (Py_UNICODE_ISPRINTABLE(ch)) { + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + } + else { + syntaxerror(tok, "invalid non-printable character U+%s", hex); + } return 0; } Py_DECREF(s); From 5f3b60ea9d70cc1c547f75593e1eb4d1b51da8a6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 19:04:22 +0300 Subject: [PATCH 3/3] Refactor PyUnicode_IsIdentifier(). --- Objects/unicodeobject.c | 57 ++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4c79bd18af2827..276547ca48a5b2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12349,52 +12349,33 @@ _PyUnicode_ScanIdentifier(PyObject *self) int PyUnicode_IsIdentifier(PyObject *self) { - Py_ssize_t i; - int ready = PyUnicode_IS_READY(self); - - Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self); - if (len == 0) { + if (PyUnicode_IS_READY(self)) { + Py_ssize_t i = _PyUnicode_ScanIdentifier(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); /* an empty string is not a valid identifier */ - return 0; - } - - int kind = 0; - const void *data = NULL; - const wchar_t *wstr = NULL; - Py_UCS4 ch; - if (ready) { - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); - ch = PyUnicode_READ(kind, data, 0); + return len && i == len; } else { - wstr = _PyUnicode_WSTR(self); - ch = wstr[0]; - } - /* PEP 3131 says that the first character must be in - XID_Start and subsequent characters in XID_Continue, - and for the ASCII range, the 2.x rules apply (i.e - start with letters and underscore, continue with - letters, digits, underscore). However, given the current - definition of XID_Start and XID_Continue, it is sufficient - to check just for these, except that _ must be allowed - as starting an identifier. */ - if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { - return 0; - } + Py_ssize_t i, len = PyUnicode_GET_SIZE(self); + if (len == 0) { + /* an empty string is not a valid identifier */ + return 0; + } - for (i = 1; i < len; i++) { - if (ready) { - ch = PyUnicode_READ(kind, data, i); + const wchar_t *wstr = _PyUnicode_WSTR(self); + Py_UCS4 ch = wstr[0]; + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { + return 0; } - else { + + for (i = 1; i < len; i++) { ch = wstr[i]; + if (!_PyUnicode_IsXidContinue(ch)) { + return 0; + } } - if (!_PyUnicode_IsXidContinue(ch)) { - return 0; - } + return 1; } - return 1; } /*[clinic input] pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy