From c3286ec4eadaf466ad825ab0987e59ca77cc6569 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Thu, 13 Feb 2025 01:07:37 +0000 Subject: [PATCH] [3.13] gh-116042: Fix location for SyntaxErrors of invalid escapes in the tokenizer (GH-116049) (cherry picked from commit 56eda256336310a08d4beb75b998488cb359444b) (cherry picked from commit 369704b428982968346c1482fccdac8b25fdd836) Co-authored-by: Pablo Galindo Salgado --- Lib/test/test_cmd_line_script.py | 2 +- Lib/test/test_string_literals.py | 39 +++++++++++--- ...-02-13-00-28-43.gh-issue-116042.861juq.rst | 2 + Parser/pegen_errors.c | 4 +- Parser/string_parser.c | 53 ++++++++++++++++--- 5 files changed, 82 insertions(+), 18 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py index 1ec5e581f81d17..84fbdfe1aec086 100644 --- a/Lib/test/test_cmd_line_script.py +++ b/Lib/test/test_cmd_line_script.py @@ -660,7 +660,7 @@ def test_syntaxerror_invalid_escape_sequence_multi_line(self): self.assertEqual( stderr.splitlines()[-3:], [ b' foo = """\\q"""', - b' ^^^^^^^^', + b' ^^', b'SyntaxError: invalid escape sequence \'\\q\'' ], ) diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py index c7c6f684cd33f0..3d793427c9ab5d 100644 --- a/Lib/test/test_string_literals.py +++ b/Lib/test/test_string_literals.py @@ -118,7 +118,7 @@ def test_eval_str_invalid_escape(self): self.assertEqual(len(w), 1) self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -128,7 +128,7 @@ def test_eval_str_invalid_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid escape sequence '\z'") self.assertEqual(exc.filename, '') - 
self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) self.assertEqual(exc.offset, 1) # Check that the warning is raised only once if there are syntax errors @@ -155,7 +155,7 @@ def test_eval_str_invalid_octal_escape(self): self.assertEqual(str(w[0].message), r"invalid octal escape sequence '\407'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -165,9 +165,32 @@ def test_eval_str_invalid_octal_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) self.assertEqual(exc.offset, 1) + def test_invalid_escape_locations_with_offset(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('error', category=SyntaxWarning) + with self.assertRaises(SyntaxError) as cm: + eval("\"'''''''''''''''''''''invalid\ Escape\"") + exc = cm.exception + self.assertEqual(w, []) + self.assertEqual(exc.msg, r"invalid escape sequence '\ '") + self.assertEqual(exc.filename, '') + self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.offset, 30) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('error', category=SyntaxWarning) + with self.assertRaises(SyntaxError) as cm: + eval("\"''Incorrect \ logic?\"") + exc = cm.exception + self.assertEqual(w, []) + self.assertEqual(exc.msg, r"invalid escape sequence '\ '") + self.assertEqual(exc.filename, '') + self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.offset, 14) + def test_eval_str_raw(self): self.assertEqual(eval(""" r'x' """), 'x') self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01') @@ -207,7 +230,7 @@ def test_eval_bytes_invalid_escape(self): self.assertEqual(len(w), 1) self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'") 
self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -217,7 +240,7 @@ def test_eval_bytes_invalid_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid escape sequence '\z'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) def test_eval_bytes_invalid_octal_escape(self): for i in range(0o400, 0o1000): @@ -231,7 +254,7 @@ def test_eval_bytes_invalid_octal_escape(self): self.assertEqual(str(w[0].message), r"invalid octal escape sequence '\407'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -241,7 +264,7 @@ def test_eval_bytes_invalid_octal_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) def test_eval_bytes_raw(self): self.assertEqual(eval(""" br'x' """), b'x') diff --git a/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst b/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst new file mode 100644 index 00000000000000..098804fa92e804 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst @@ -0,0 +1,2 @@ +Fix location for SyntaxErrors of invalid escapes in the tokenizer. 
Patch by +Pablo Galindo diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index e8f11a67e50fa0..d692d6b0c86446 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -352,8 +352,8 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { - Py_ssize_t size = p->tok->inp - p->tok->buf; - error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); + Py_ssize_t size = p->tok->inp - p->tok->line_start; + error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace"); } else if (p->tok->fp == NULL || p->tok->fp == stdin) { error_line = get_error_line_from_tokenizer_buffers(p, lineno); diff --git a/Parser/string_parser.c b/Parser/string_parser.c index 9537c543b0eb93..e92984935430ce 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -11,7 +11,7 @@ //// STRING HANDLING FUNCTIONS //// static int -warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) +warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t) { if (p->call_invalid_rules) { // Do not report warnings if we are in the second pass of the parser @@ -41,8 +41,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token else { category = PyExc_DeprecationWarning; } + + // Calculate the lineno and the col_offset of the invalid escape sequence + const char *start = buffer; + const char *end = first_invalid_escape; + int lineno = t->lineno; + int col_offset = t->col_offset; + while (start < end) { + if (*start == '\n') { + lineno++; + col_offset = 0; + } + else { + col_offset++; + } + start++; + } + + // Count the number of quotes in the token + char first_quote = 0; + if (lineno == t->lineno) { + int quote_count = 0; + char* tok = PyBytes_AsString(t->bytes); + for (int i = 0; i < PyBytes_Size(t->bytes); i++) { + if (tok[i] == 
'\'' || tok[i] == '\"') { + if (quote_count == 0) { + first_quote = tok[i]; + } + if (tok[i] == first_quote) { + quote_count++; + } + } else { + break; + } + } + + col_offset += quote_count; + } + if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, - t->lineno, NULL, NULL) < 0) { + lineno, NULL, NULL) < 0) { if (PyErr_ExceptionMatches(category)) { /* Replace the Syntax/DeprecationWarning exception with a SyntaxError to get a more accurate error report */ @@ -53,11 +91,12 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token error location, if p->known_err_token is not set. */ p->known_err_token = t; if (octal) { - RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", - first_invalid_escape); + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1, + "invalid octal escape sequence '\\%.3s'", first_invalid_escape); } else { - RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1, + "invalid escape sequence '\\%c'", c); } } Py_DECREF(msg); @@ -151,7 +190,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) // HACK: later we can simply pass the line no, since we don't preserve the tokens // when we are decoding the string but we preserve the line numbers. if (v != NULL && first_invalid_escape != NULL && t != NULL) { - if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { + if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) { /* We have not decref u before because first_invalid_escape points inside u. 
*/ Py_XDECREF(u); @@ -173,7 +212,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) } if (first_invalid_escape != NULL) { - if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { + if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) { Py_DECREF(result); return NULL; } pFad - Phonifier reborn

pFad - The Proxy pFad. © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy