From 600051de4931ab4ee3d3a34572c3bbda3eefbf8a Mon Sep 17 00:00:00 2001 From: Josh Klar Date: Sat, 10 Aug 2019 21:27:20 -0700 Subject: [PATCH] py: Implement partial PEP-498 (f-string) support This implements (most of) the PEP-498 spec for f-strings, with two exceptions: - raw f-strings (`fr` or `rf` prefixes) raise `NotImplementedError` - one special corner case does not function as specified in the PEP (more on that in a moment) This is implemented in the core as a syntax translation, brute-forcing all f-strings to run through `String.format`. For example, the statement `x='world'; print(f'hello {x}')` gets translated *at a syntax level* (injected into the lexer) to `x='world'; print('hello {}'.format(x))`. While this may lead to weird column results in tracebacks, it seemed like the fastest, most efficient, and *likely* most RAM-friendly option, despite being implemented under the hood with a completely separate `vstr_t`. Since [string concatenation of adjacent literals is implemented in the lexer](https://github.com/micropython/micropython/commit/534b7c368dc2af7720f3aaed0c936ef46d773957), two side effects emerge: - All strings with at least one f-string portion are concatenated into a single literal which *must* be run through `String.format()` wholesale, and: - Concatenation of a raw string with interpolation characters with an f-string will cause `IndexError`/`KeyError`, which is both different from CPython *and* different from the corner case mentioned in the PEP (which gave an example of the following:) ```python x = 10 y = 'hi' assert ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e') == 'ab10{c}str< hi >de' ``` The above-linked commit detailed a pretty solid case for leaving string concatenation in the lexer rather than putting it in the parser, and undoing that decision would likely be disproportionately costly on resources for the sake of a probably-low-impact corner case. An alternative to become complaint with this corner case of the PEP would be to revert to string concatenation in the parser *only when an f-string is part of concatenation*, though I've done no investigation on the difficulty or costs of doing this. A decent set of tests is included. I've manually tested this on the `unix` port on Linux and on a Feather M4 Express (`atmel-samd`) and things seem sane. --- ports/bare-arm/mpconfigport.h | 1 + ports/unix/mpconfigport.h | 1 + ports/windows/mpconfigport.h | 1 + py/lexer.c | 131 +++++++++++++++++++++- py/lexer.h | 8 ++ py/mpconfig.h | 6 + py/parse.c | 8 ++ tests/basics/string_pep498_fstring.py | 115 +++++++++++++++++++ tests/basics/string_pep498_fstring.py.exp | 0 tests/cmdline/cmd_parsetree.py.exp | 4 +- 10 files changed, 268 insertions(+), 7 deletions(-) create mode 100644 tests/basics/string_pep498_fstring.py create mode 100644 tests/basics/string_pep498_fstring.py.exp diff --git a/ports/bare-arm/mpconfigport.h b/ports/bare-arm/mpconfigport.h index 4567676580324..b42d95da30f70 100644 --- a/ports/bare-arm/mpconfigport.h +++ b/ports/bare-arm/mpconfigport.h @@ -37,6 +37,7 @@ #define MICROPY_PY_ARRAY (0) #define MICROPY_PY_ATTRTUPLE (0) #define MICROPY_PY_COLLECTIONS (0) +#define MICROPY_PY_FSTRING (0) #define MICROPY_PY_MATH (0) #define MICROPY_PY_CMATH (0) #define MICROPY_PY_IO (0) diff --git a/ports/unix/mpconfigport.h b/ports/unix/mpconfigport.h index f3c61c18f100d..56c8734ac2ee9 100644 --- a/ports/unix/mpconfigport.h +++ b/ports/unix/mpconfigport.h @@ -122,6 +122,7 @@ #define MICROPY_PY_SYS_EXC_INFO (1) #define MICROPY_PY_COLLECTIONS_DEQUE (1) #define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1) +#define MICROPY_PY_FSTRING (1) #ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS #define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1) #endif diff --git a/ports/windows/mpconfigport.h b/ports/windows/mpconfigport.h index 84c196e11fa9c..a10fad871c62a 100644 --- a/ports/windows/mpconfigport.h +++ b/ports/windows/mpconfigport.h @@ -90,6 +90,7 @@ #define MICROPY_PY_SYS_EXC_INFO (1) #define MICROPY_PY_COLLECTIONS_DEQUE (1) #define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1) +#define MICROPY_PY_FSTRING (1) #define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1) #define MICROPY_PY_MATH_ISCLOSE (1) #define MICROPY_PY_CMATH (1) diff --git a/py/lexer.c b/py/lexer.c index 10bb999af6b70..d1186daa0c96a 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -62,6 +62,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) { return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3; } +STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) { + return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4; +} + STATIC bool is_char_following(mp_lexer_t *lex, byte c) { return lex->chr1 == c; } @@ -105,7 +109,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) { STATIC bool is_string_or_bytes(mp_lexer_t *lex) { return is_char_or(lex, '\'', '\"') - || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) + || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"')) + || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r')) + && is_char_following_following_or(lex, '\'', '\"'))) || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"')); } @@ -119,6 +125,29 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) { return is_head_of_identifier(lex) || is_digit(lex); } +STATIC void swap_char_banks(mp_lexer_t *lex) { + if (lex->vstr_postfix_processing) { + lex->chr3 = lex->chr0; + lex->chr4 = lex->chr1; + lex->chr5 = lex->chr2; + lex->chr0 = lex->vstr_postfix.buf[0]; + lex->chr1 = lex->vstr_postfix.buf[1]; + lex->chr2 = lex->vstr_postfix.buf[2]; + + lex->vstr_postfix_idx = 3; + } else { + // blindly reset to the "backup" bank when done postfix processing + // this restores control to the mp_reader + lex->chr0 = lex->chr3; + lex->chr1 = lex->chr4; + lex->chr2 = lex->chr5; + // willfully ignoring setting chr3-5 here - WARNING consider those garbage data now + + vstr_reset(&lex->vstr_postfix); + lex->vstr_postfix_idx = 0; + } +} + STATIC void next_char(mp_lexer_t *lex) { if (lex->chr0 == '\n') { // a new line @@ -134,7 +163,16 @@ STATIC void next_char(mp_lexer_t *lex) { lex->chr0 = lex->chr1; lex->chr1 = lex->chr2; - lex->chr2 = lex->reader.readbyte(lex->reader.data); + + if (lex->vstr_postfix_processing) { + if (lex->vstr_postfix_idx == lex->vstr_postfix.len) { + lex->chr2 = '\0'; + } else { + lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++]; + } + } else { + lex->chr2 = lex->reader.readbyte(lex->reader.data); + } if (lex->chr1 == '\r') { // CR is a new line, converted to LF @@ -149,6 +187,11 @@ STATIC void next_char(mp_lexer_t *lex) { if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') { lex->chr2 = '\n'; } + + if (lex->vstr_postfix_processing && lex->chr0 == '\0') { + lex->vstr_postfix_processing = false; + swap_char_banks(lex); + } } STATIC void indent_push(mp_lexer_t *lex, size_t indent) { @@ -270,7 +313,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) { return true; } -STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { +STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) { // get first quoting character char quote_char = '\''; if (is_char(lex, '\"')) { @@ -291,15 +334,69 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { } size_t n_closing = 0; + # if MICROPY_PY_FSTRING + bool in_expression = false; + bool expression_eat = true; + # endif + while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { if (is_char(lex, quote_char)) { n_closing += 1; vstr_add_char(&lex->vstr, CUR_CHAR(lex)); } else { n_closing = 0; + + # if MICROPY_PY_FSTRING + if (is_fstring && is_char(lex, '{')) { + vstr_add_char(&lex->vstr, CUR_CHAR(lex)); + in_expression = !in_expression; + expression_eat = in_expression; + + if (lex->vstr_postfix.len == 0) { + vstr_add_str(&lex->vstr_postfix, ".format("); + } + + next_char(lex); + continue; + } + + if (is_fstring && is_char(lex, '}')) { + vstr_add_char(&lex->vstr, CUR_CHAR(lex)); + + if (in_expression) { + in_expression = false; + vstr_add_char(&lex->vstr_postfix, ','); + } + + next_char(lex); + continue; + } + + if (in_expression) { + // throw errors for illegal chars inside f-string expressions + if (is_char(lex, '#') || is_char(lex, '\\')) { + lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING; + return; + } else if (is_char(lex, ':')) { + expression_eat = false; + } + + unichar c = CUR_CHAR(lex); + if (expression_eat) { + vstr_add_char(&lex->vstr_postfix, c); + } else { + vstr_add_char(&lex->vstr, c); + } + + next_char(lex); + continue; + } + # endif + if (is_char(lex, '\\')) { next_char(lex); unichar c = CUR_CHAR(lex); + if (is_raw) { // raw strings allow escaping of quotes, but the backslash is also emitted vstr_add_char(&lex->vstr, '\\'); @@ -448,6 +545,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { } void mp_lexer_to_next(mp_lexer_t *lex) { + if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) { + // end format call injection + vstr_add_char(&lex->vstr_postfix, ')'); + lex->vstr_postfix_processing = true; + swap_char_banks(lex); + } + // start new token text vstr_reset(&lex->vstr); @@ -503,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) { do { // parse type codes bool is_raw = false; + bool is_fstring = false; mp_token_kind_t kind = MP_TOKEN_STRING; int n_char = 0; if (is_char(lex, 'u')) { @@ -521,7 +626,23 @@ void mp_lexer_to_next(mp_lexer_t *lex) { kind = MP_TOKEN_BYTES; n_char = 2; } + # if MICROPY_PY_FSTRING + if (is_char_following(lex, 'f')) { + lex->tok_kind = MP_TOKEN_FSTRING_RAW; + break; + } + # endif } + # if MICROPY_PY_FSTRING + else if (is_char(lex, 'f')) { + if (is_char_following(lex, 'r')) { + lex->tok_kind = MP_TOKEN_FSTRING_RAW; + break; + } + n_char = 1; + is_fstring = true; + } + # endif // Set or check token kind if (lex->tok_kind == MP_TOKEN_END) { @@ -540,13 +661,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) { } // Parse the literal - parse_string_literal(lex, is_raw); + parse_string_literal(lex, is_raw, is_fstring); // Skip whitespace so we can check if there's another string following skip_whitespace(lex, true); } while (is_string_or_bytes(lex)); - } else if (is_head_of_identifier(lex)) { lex->tok_kind = MP_TOKEN_NAME; @@ -700,6 +820,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) { lex->num_indent_level = 1; lex->indent_level = m_new(uint16_t, lex->alloc_indent_level); vstr_init(&lex->vstr, 32); + vstr_init(&lex->vstr_postfix, 0); // store sentinel for first indentation level lex->indent_level[0] = 0; diff --git a/py/lexer.h b/py/lexer.h index b9f97013a1bd9..88e2f5bcd3591 100644 --- a/py/lexer.h +++ b/py/lexer.h @@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t { MP_TOKEN_INVALID, MP_TOKEN_DEDENT_MISMATCH, MP_TOKEN_LONELY_STRING_OPEN, + # if MICROPY_PY_FSTRING + MP_TOKEN_MALFORMED_FSTRING, + MP_TOKEN_FSTRING_RAW, + # endif MP_TOKEN_NEWLINE, MP_TOKEN_INDENT, @@ -157,6 +161,7 @@ typedef struct _mp_lexer_t { mp_reader_t reader; // stream source unichar chr0, chr1, chr2; // current cached characters from source + unichar chr3, chr4, chr5; // current cached characters from alt source size_t line; // current source line size_t column; // current source column @@ -172,6 +177,9 @@ typedef struct _mp_lexer_t { size_t tok_column; // token source column mp_token_kind_t tok_kind; // token kind vstr_t vstr; // token data + vstr_t vstr_postfix; // postfix to apply to string + bool vstr_postfix_processing; + uint16_t vstr_postfix_idx; } mp_lexer_t; mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader); diff --git a/py/mpconfig.h b/py/mpconfig.h index f2b3af1f2a314..666e0c5ffdb82 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -1108,6 +1108,12 @@ typedef double mp_float_t; #define MICROPY_PY_COLLECTIONS_NAMEDTUPLE__ASDICT (0) #endif +// Whether to include support for PEP-498 f-strings +#ifndef MICROPY_PY_FSTRING +#define MICROPY_PY_FSTRING (0) +#endif + + // Whether to provide "math" module #ifndef MICROPY_PY_MATH #define MICROPY_PY_MATH (1) diff --git a/py/parse.c b/py/parse.c index b93282165f88a..768c7ee88f1e5 100644 --- a/py/parse.c +++ b/py/parse.c @@ -1155,6 +1155,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) { } else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) { exc = mp_obj_new_exception_msg(&mp_type_IndentationError, MP_ERROR_TEXT("unindent doesn't match any outer indent level")); + # if MICROPY_PY_FSTRING + } else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) { + exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, + MP_ERROR_TEXT("malformed f-string")); + } else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) { + exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, + MP_ERROR_TEXT("raw f-strings are not supported")); + # endif } else { exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, MP_ERROR_TEXT("invalid syntax")); diff --git a/tests/basics/string_pep498_fstring.py b/tests/basics/string_pep498_fstring.py new file mode 100644 index 0000000000000..429c93d9feeec --- /dev/null +++ b/tests/basics/string_pep498_fstring.py @@ -0,0 +1,115 @@ +# Tests against https://www.python.org/dev/peps/pep-0498/ + +import sys + +assert f'no interpolation' == 'no interpolation' +assert f"no interpolation" == 'no interpolation' + +# Quoth the PEP: +# Backslashes may not appear anywhere within expressions. Comments, using the +# '#' character, are not allowed inside an expression +# +# CPython (3.7.4 on Linux) raises a SyntaxError here: +# >>> f'{#}' +# File "", line 1 +# SyntaxError: f-string expression part cannot include '#' +# >>> f'{\}' +# File "", line 1 +# SyntaxError: f-string expression part cannot include a backslash +# >>> f'{\\}' +# File "", line 1 +# SyntaxError: f-string expression part cannot include a backslash +# >>> f'{\#}' +# File "", line 1 +# SyntaxError: f-string expression part cannot include a backslash + +# Backslashes and comments allowed outside expression +assert f"\\" == "\\" +assert f'#' == '#' + +## But not inside +try: + eval("f'{\}'") +except SyntaxError: + pass +else: + raise AssertionError('f-string with backslash in expression did not raise SyntaxError') + +try: + eval("f'{#}'") +except SyntaxError: + pass +else: + raise AssertionError('f-string with \'#\' in expression did not raise SyntaxError') + +# Quoth the PEP: +# While scanning the string for expressions, any doubled braces '{{' or '}}' +# inside literal portions of an f-string are replaced by the corresponding +# single brace. Doubled literal opening braces do not signify the start of an +# expression. A single closing curly brace '}' in the literal portion of a +# string is an error: literal closing curly braces must be doubled '}}' in +# order to represent a single closing brace. +# +# CPython (3.7.4 on Linux) raises a SyntaxError for the last case: +# >>> f'{{}' +# File "", line 1 +# SyntaxError: f-string: single '}' is not allowed + +assert f'{{}}' == '{}' + +try: + eval("f'{{}'") +except (ValueError, SyntaxError): + pass +else: + raise RuntimeError('Expected ValueError for invalid f-string literal bracing') + +x = 1 +assert f'{x}' == '1' + +# Quoth the PEP: +# The expressions that are extracted from the string are evaluated in the +# context where the f-string appeared. This means the expression has full +# access to local and global variables. Any valid Python expression can be +# used, including function and method calls. Because the f-strings are +# evaluated where the string appears in the source code, there is no additional +# expressiveness available with f-strings. There are also no additional +# security concerns: you could have also just written the same expression, not +# inside of an f-string: + +def foo(): + return 20 + +assert f'result={foo()}' == 'result=20' +assert f'result={foo()}' == 'result={}'.format(foo()) +assert f'result={foo()}' == 'result={result}'.format(result=foo()) + + +# Quoth the PEP: +# Adjacent f-strings and regular strings are concatenated. Regular strings are +# concatenated at compile time, and f-strings are concatenated at run time. For +# example, the expression: +# +# >>> x = 10 +# >>> y = 'hi' +# >>> 'a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e' +# +# yields the value: 'ab10{c}str< hi >de' +# +# Because strings are concatenated at lexer time rather than parser time in +# MicroPython for mostly RAM efficiency reasons (see +# https://github.com/micropython/micropython/commit/534b7c368dc2af7720f3aaed0c936ef46d773957), +# and because f-strings here are implemented as a syntax translation +# (f'{something}' => '{}'.format(something)), this particular functionality is unimplemented, +# and in the above example, the '{c}' portion will trigger a KeyError on String.format() + +x = 10 +y = 'hi' +assert (f'h' f'i') == 'hi' +assert (f'h' 'i') == 'hi' +assert ('h' f'i') == 'hi' +assert f'{x:^4}' == ' 10 ' +assert ('a' 'b' f'{x}' f'str<{y:^4}>' 'd' 'e') == 'ab10str< hi >de' + +# Other tests +assert f'{{{4*10}}}' == '{40}' diff --git a/tests/basics/string_pep498_fstring.py.exp b/tests/basics/string_pep498_fstring.py.exp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/cmdline/cmd_parsetree.py.exp b/tests/cmdline/cmd_parsetree.py.exp index 18986318a02b4..e5cb1f91fe382 100644 --- a/tests/cmdline/cmd_parsetree.py.exp +++ b/tests/cmdline/cmd_parsetree.py.exp @@ -1,6 +1,6 @@ ---------------- [ 4] rule(1) (n=9) - tok(4) + tok(10) [ 4] rule(22) (n=4) id(i) [ 4] rule(44) (n=1) @@ -9,7 +9,7 @@ NULL [ 6] rule(5) (n=2) id(a) - tok(14) + tok(20) [ 7] rule(5) (n=2) id(b) str(str) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy