Skip to content

Commit 74ea6b5

Browse files
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
1 parent: f3a5b7a · commit: 74ea6b5

File tree

10 files changed

+90
-43
lines changed

10 files changed

+90
-43
lines changed

Include/cpython/unicodeobject.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
12221222
and where the hash values are equal (i.e. a very probable match) */
12231223
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
12241224

1225+
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
1226+
12251227
#ifdef __cplusplus
12261228
}
12271229
#endif

Include/errcode.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,6 @@ extern "C" {
2929
#define E_EOFS 23 /* EOF in triple-quoted string */
3030
#define E_EOLS 24 /* EOL in single-quoted string */
3131
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
32-
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
3332
#define E_BADSINGLE 27 /* Ill-formed single statement input */
3433

3534
#ifdef __cplusplus

Lib/test/test_fstring.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -583,7 +583,7 @@ def test_missing_expression(self):
583583
])
584584

585585
# Different error message is raised for other whitespace characters.
586-
self.assertAllRaise(SyntaxError, 'invalid character in identifier',
586+
self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
587587
["f'''{\xa0}'''",
588588
"\xa0",
589589
])

Lib/test/test_source_encoding.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,9 @@ def test_issue7820(self):
5757
# one byte in common with the UTF-16-LE BOM
5858
self.assertRaises(SyntaxError, eval, b'\xff\x20')
5959

60+
# one byte in common with the UTF-8 BOM
61+
self.assertRaises(SyntaxError, eval, b'\xef\x20')
62+
6063
# two bytes in common with the UTF-8 BOM
6164
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
6265

Lib/test/test_unicode_identifiers.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,11 @@ def test_non_bmp_normalized(self):
2020
def test_invalid(self):
2121
try:
2222
from test import badsyntax_3131
23-
except SyntaxError as s:
24-
self.assertEqual(str(s),
25-
"invalid character in identifier (badsyntax_3131.py, line 2)")
23+
except SyntaxError as err:
24+
self.assertEqual(str(err),
25+
"invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
26+
self.assertEqual(err.lineno, 2)
27+
self.assertEqual(err.offset, 1)
2628
else:
2729
self.fail("expected exception didn't occur")
2830

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Improved syntax errors for invalid characters in source code.

Objects/unicodeobject.c

Lines changed: 41 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
1230912309
Py_RETURN_TRUE;
1231012310
}
1231112311

12312-
int
12313-
PyUnicode_IsIdentifier(PyObject *self)
12312+
Py_ssize_t
12313+
_PyUnicode_ScanIdentifier(PyObject *self)
1231412314
{
1231512315
Py_ssize_t i;
12316-
int ready = PyUnicode_IS_READY(self);
12316+
if (PyUnicode_READY(self) == -1)
12317+
return -1;
1231712318

12318-
Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12319+
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
1231912320
if (len == 0) {
1232012321
/* an empty string is not a valid identifier */
1232112322
return 0;
1232212323
}
1232312324

12324-
int kind = 0;
12325-
const void *data = NULL;
12326-
const wchar_t *wstr = NULL;
12327-
Py_UCS4 ch;
12328-
if (ready) {
12329-
kind = PyUnicode_KIND(self);
12330-
data = PyUnicode_DATA(self);
12331-
ch = PyUnicode_READ(kind, data, 0);
12332-
}
12333-
else {
12334-
wstr = _PyUnicode_WSTR(self);
12335-
ch = wstr[0];
12336-
}
12325+
int kind = PyUnicode_KIND(self);
12326+
const void *data = PyUnicode_DATA(self);
12327+
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
1233712328
/* PEP 3131 says that the first character must be in
1233812329
XID_Start and subsequent characters in XID_Continue,
1233912330
and for the ASCII range, the 2.x rules apply (i.e
@@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
1234712338
}
1234812339

1234912340
for (i = 1; i < len; i++) {
12350-
if (ready) {
12351-
ch = PyUnicode_READ(kind, data, i);
12341+
ch = PyUnicode_READ(kind, data, i);
12342+
if (!_PyUnicode_IsXidContinue(ch)) {
12343+
return i;
1235212344
}
12353-
else {
12354-
ch = wstr[i];
12345+
}
12346+
return i;
12347+
}
12348+
12349+
int
12350+
PyUnicode_IsIdentifier(PyObject *self)
12351+
{
12352+
if (PyUnicode_IS_READY(self)) {
12353+
Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12354+
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12355+
/* an empty string is not a valid identifier */
12356+
return len && i == len;
12357+
}
12358+
else {
12359+
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
12360+
if (len == 0) {
12361+
/* an empty string is not a valid identifier */
12362+
return 0;
1235512363
}
12356-
if (!_PyUnicode_IsXidContinue(ch)) {
12364+
12365+
const wchar_t *wstr = _PyUnicode_WSTR(self);
12366+
Py_UCS4 ch = wstr[0];
12367+
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
1235712368
return 0;
1235812369
}
12370+
12371+
for (i = 1; i < len; i++) {
12372+
ch = wstr[i];
12373+
if (!_PyUnicode_IsXidContinue(ch)) {
12374+
return 0;
12375+
}
12376+
}
12377+
return 1;
1235912378
}
12360-
return 1;
1236112379
}
1236212380

1236312381
/*[clinic input]

Parser/pegen/pegen.c

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
337337
case E_TOKEN:
338338
msg = "invalid token";
339339
break;
340-
case E_IDENTIFIER:
341-
msg = "invalid character in identifier";
342-
break;
343340
case E_EOFS:
344341
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
345342
return -1;

Parser/tokenizer.c

Lines changed: 37 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1101,25 +1101,53 @@ static int
11011101
verify_identifier(struct tok_state *tok)
11021102
{
11031103
PyObject *s;
1104-
int result;
11051104
if (tok->decoding_erred)
11061105
return 0;
11071106
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
11081107
if (s == NULL) {
11091108
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1110-
PyErr_Clear();
1111-
tok->done = E_IDENTIFIER;
1112-
} else {
1109+
tok->done = E_DECODE;
1110+
}
1111+
else {
11131112
tok->done = E_ERROR;
11141113
}
11151114
return 0;
11161115
}
1117-
result = PyUnicode_IsIdentifier(s);
1118-
Py_DECREF(s);
1119-
if (result == 0) {
1120-
tok->done = E_IDENTIFIER;
1116+
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1117+
if (invalid < 0) {
1118+
Py_DECREF(s);
1119+
tok->done = E_ERROR;
1120+
return 0;
11211121
}
1122-
return result;
1122+
assert(PyUnicode_GET_LENGTH(s) > 0);
1123+
if (invalid < PyUnicode_GET_LENGTH(s)) {
1124+
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1125+
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1126+
/* Determine the offset in UTF-8 encoded input */
1127+
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1128+
if (s != NULL) {
1129+
Py_SETREF(s, PyUnicode_AsUTF8String(s));
1130+
}
1131+
if (s == NULL) {
1132+
tok->done = E_ERROR;
1133+
return 0;
1134+
}
1135+
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1136+
}
1137+
Py_DECREF(s);
1138+
// PyUnicode_FromFormatV() does not support %X
1139+
char hex[9];
1140+
snprintf(hex, sizeof(hex), "%04X", ch);
1141+
if (Py_UNICODE_ISPRINTABLE(ch)) {
1142+
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1143+
}
1144+
else {
1145+
syntaxerror(tok, "invalid non-printable character U+%s", hex);
1146+
}
1147+
return 0;
1148+
}
1149+
Py_DECREF(s);
1150+
return 1;
11231151
}
11241152

11251153
static int

Python/pythonrun.c

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
16031603
msg = "unexpected character after line continuation character";
16041604
break;
16051605

1606-
case E_IDENTIFIER:
1607-
msg = "invalid character in identifier";
1608-
break;
16091606
case E_BADSINGLE:
16101607
msg = "multiple statements found while compiling a single statement";
16111608
break;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy