From ef9d22f49f7bc371d2ec6f22c07fbeb1a816a431 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 6 Jun 2025 02:17:57 +0100 Subject: [PATCH 1/4] gh-135148: Correctly handle f/t strings with comments and debug expressions --- Lib/test/test_fstring.py | 6 ++ ...-06-06-02-24-42.gh-issue-135148.r-t2sC.rst | 3 + Parser/lexer/lexer.c | 85 ++++++++++++++++--- 3 files changed, 81 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index dd58e032a8befe..fcda09fc58d2c7 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1651,6 +1651,12 @@ def __repr__(self): self.assertEqual(f"{1+2 = # my comment }", '1+2 = \n 3') + self.assertEqual(f'{""" # booo + """=}', '""" # booo\n """=\' # booo\\n \'') + + self.assertEqual(f'{" # nooo "=}', '" # nooo "=\' # nooo \'') + self.assertEqual(f'{" \" # nooo \" "=}', '" \\" # nooo \\" "=\' " # nooo " \'') + # These next lines contains tabs. Backslash escapes don't # work in f-strings. # patchcheck doesn't like these tabs. So the only way to test diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst new file mode 100644 index 00000000000000..9b1f62433b45ed --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst @@ -0,0 +1,3 @@ +Fixed a bug where f-string debug expressions (using =) would incorrectly +strip out parts of strings containing escaped quotes and # characters. Patch +by Pablo Galindo. 
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 4d10bccf0a53f2..bfa98b2fe70d43 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -123,35 +123,96 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { // Check if there is a # character in the expression int hash_detected = 0; + int in_string = 0; + char string_quote = 0; for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { - if (tok_mode->last_expr_buffer[i] == '#') { + char ch = tok_mode->last_expr_buffer[i]; + if (ch == '\\' && i + 1 < tok_mode->last_expr_size - tok_mode->last_expr_end) { + // Skip the next character if it's an escape sequence + i++; + continue; + } + if (ch == '"' || ch == '\'') { + if (!in_string) { + in_string = 1; + string_quote = ch; + } else if (ch == string_quote) { + // Check for triple quotes + if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && + i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { + // Skip the rest of the triple quote + i += 2; + } + in_string = 0; + } + } else if (ch == '#' && !in_string) { hash_detected = 1; break; } } - + // If we found a # character in the expression, we need to handle comments if (hash_detected) { + // Calculate length of input we need to process Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end; + + // Allocate buffer for processed result, with room for null terminator char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char)); if (!result) { return -1; } - Py_ssize_t i = 0; - Py_ssize_t j = 0; + // Initialize counters and state + Py_ssize_t i = 0; // Input position + Py_ssize_t j = 0; // Output position + in_string = 0; // Whether we're currently inside a string + string_quote = 0; // The quote character for current string (' or ") + // Process each character of input for (i = 0, j = 0; i < input_length; i++) { - if (tok_mode->last_expr_buffer[i] == '#') { - // Skip characters until newline or end of string + char ch = 
tok_mode->last_expr_buffer[i]; + + // Handle escape sequences - copy both backslash and next char + if (ch == '\\' && i + 1 < input_length) { + result[j++] = ch; // Copy backslash + result[j++] = tok_mode->last_expr_buffer[++i]; // Copy escaped char + continue; + } + + // Handle string quotes + if (ch == '"' || ch == '\'') { + if (!in_string) { + // Start of new string + in_string = 1; + string_quote = ch; + } else if (ch == string_quote) { + // Potential end of string - check for triple quotes + if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && + i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { + // Found triple quote - copy all three quotes + result[j++] = ch; + result[j++] = ch; + result[j++] = ch; + i += 2; // Skip the other two quotes + continue; + } + // End of regular string + in_string = 0; + } + result[j++] = ch; // Copy the quote character + } + // Handle comments - skip everything until newline + else if (ch == '#' && !in_string) { while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') { if (tok_mode->last_expr_buffer[i] == '\n') { - result[j++] = tok_mode->last_expr_buffer[i]; + result[j++] = tok_mode->last_expr_buffer[i]; // Keep newline break; } - i++; + i++; // Skip comment character } - } else { - result[j++] = tok_mode->last_expr_buffer[i]; + } + // Copy any other character unchanged + else { + result[j++] = ch; } } @@ -164,11 +225,9 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { tok_mode->last_expr_size - tok_mode->last_expr_end, NULL ); - } - - if (!res) { + if (!res) { return -1; } token->metadata = res; From fdc81d612ea240e53df787e703097346b6d533fa Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sat, 7 Jun 2025 02:06:46 +0100 Subject: [PATCH 2/4] Address review --- Lib/test/test_fstring.py | 6 +++ Parser/lexer/lexer.c | 82 ++++++++++++++++------------------------ 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 
fcda09fc58d2c7..89d425d6e27aa7 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1657,6 +1657,12 @@ def __repr__(self): self.assertEqual(f'{" # nooo "=}', '" # nooo "=\' # nooo \'') self.assertEqual(f'{" \" # nooo \" "=}', '" \\" # nooo \\" "=\' " # nooo " \'') + self.assertEqual(f'{ # some comment goes here + """hello"""=}', ' \n """hello"""=\'hello\'') + self.assertEqual(f'{"""# this is not a comment + a""" # this is a comment + }', '# this is not a comment\n a') + # These next lines contains tabs. Backslash escapes don't # work in f-strings. # patchcheck doesn't like these tabs. So the only way to test diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index bfa98b2fe70d43..2a461ac49e8cbe 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -121,99 +121,81 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { } PyObject *res = NULL; - // Check if there is a # character in the expression + // Look for a # character outside of string literals int hash_detected = 0; int in_string = 0; + char quote_char = 0; char string_quote = 0; + for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { char ch = tok_mode->last_expr_buffer[i]; - if (ch == '\\' && i + 1 < tok_mode->last_expr_size - tok_mode->last_expr_end) { - // Skip the next character if it's an escape sequence + + // Skip escaped characters + if (ch == '\\') { i++; continue; } + + // Handle quotes if (ch == '"' || ch == '\'') { if (!in_string) { in_string = 1; - string_quote = ch; - } else if (ch == string_quote) { - // Check for triple quotes - if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && - i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { - // Skip the rest of the triple quote - i += 2; - } + quote_char = ch; + } + else if (ch == quote_char) { in_string = 0; } - } else if (ch == '#' && !in_string) { + continue; + } + + // Check for # outside strings + if (ch == '#' && !in_string) { hash_detected = 1; break; } } 
// If we found a # character in the expression, we need to handle comments if (hash_detected) { - // Calculate length of input we need to process - Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end; - - // Allocate buffer for processed result, with room for null terminator - char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char)); + // Allocate buffer for processed result + char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char)); if (!result) { return -1; } - // Initialize counters and state Py_ssize_t i = 0; // Input position Py_ssize_t j = 0; // Output position - in_string = 0; // Whether we're currently inside a string - string_quote = 0; // The quote character for current string (' or ") + in_string = 0; // Whether we're in a string + string_quote = 0; // Current string quote char - // Process each character of input - for (i = 0, j = 0; i < input_length; i++) { + // Process each character + while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { char ch = tok_mode->last_expr_buffer[i]; - // Handle escape sequences - copy both backslash and next char - if (ch == '\\' && i + 1 < input_length) { - result[j++] = ch; // Copy backslash - result[j++] = tok_mode->last_expr_buffer[++i]; // Copy escaped char - continue; - } - // Handle string quotes if (ch == '"' || ch == '\'') { if (!in_string) { - // Start of new string in_string = 1; string_quote = ch; } else if (ch == string_quote) { - // Potential end of string - check for triple quotes - if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && - i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { - // Found triple quote - copy all three quotes - result[j++] = ch; - result[j++] = ch; - result[j++] = ch; - i += 2; // Skip the other two quotes - continue; - } - // End of regular string in_string = 0; } - result[j++] = ch; // Copy the quote character + result[j++] = ch; } - // Handle comments - skip everything until 
newline + // Skip comments else if (ch == '#' && !in_string) { - while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') { - if (tok_mode->last_expr_buffer[i] == '\n') { - result[j++] = tok_mode->last_expr_buffer[i]; // Keep newline - break; - } - i++; // Skip comment character + while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && + tok_mode->last_expr_buffer[i] != '\n') { + i++; + } + if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { + result[j++] = '\n'; } } - // Copy any other character unchanged + // Copy other chars else { result[j++] = ch; } + i++; } result[j] = '\0'; // Null-terminate the result string From 3c86cce55143ed98db6c8a51a431a363eecbd4aa Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sat, 7 Jun 2025 14:33:49 +0100 Subject: [PATCH 3/4] Fix linting --- Parser/lexer/lexer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 2a461ac49e8cbe..04c9777cd616ae 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -126,16 +126,16 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { int in_string = 0; char quote_char = 0; char string_quote = 0; - + for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { char ch = tok_mode->last_expr_buffer[i]; - + // Skip escaped characters if (ch == '\\') { i++; continue; } - + // Handle quotes if (ch == '"' || ch == '\'') { if (!in_string) { @@ -147,7 +147,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { } continue; } - + // Check for # outside strings if (ch == '#' && !in_string) { hash_detected = 1; @@ -183,7 +183,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { } // Skip comments else if (ch == '#' && !in_string) { - while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && + while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && tok_mode->last_expr_buffer[i] != '\n') { i++; 
} From e736c992906dc0d3fb54a44aed1e23ff2f8ca33b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 15 Jul 2025 12:16:51 +0200 Subject: [PATCH 4/4] Add comment and remove unnecessary variable --- Parser/lexer/lexer.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 04c9777cd616ae..e71d7391d8a439 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -125,7 +125,6 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { int hash_detected = 0; int in_string = 0; char quote_char = 0; - char string_quote = 0; for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { char ch = tok_mode->last_expr_buffer[i]; @@ -138,6 +137,13 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { // Handle quotes if (ch == '"' || ch == '\'') { + // The following if/else block works because there is an odd number + // of quotes in STRING tokens and the lexer only ever reaches this + // function with valid STRING tokens. 
+ // For example: """hello""" + // First quote: in_string = 1 + // Second quote: in_string = 0 + // Third quote: in_string = 1 if (!in_string) { in_string = 1; quote_char = ch; @@ -165,7 +171,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { Py_ssize_t i = 0; // Input position Py_ssize_t j = 0; // Output position in_string = 0; // Whether we're in a string - string_quote = 0; // Current string quote char + quote_char = 0; // Current string quote char // Process each character while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { @@ -173,10 +179,11 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { // Handle string quotes if (ch == '"' || ch == '\'') { + // See comment above to understand this part if (!in_string) { in_string = 1; - string_quote = ch; - } else if (ch == string_quote) { + quote_char = ch; + } else if (ch == quote_char) { in_string = 0; } result[j++] = ch; pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy