py: Implement partial PEP-498 (f-string) support

klardotsh · klardotsh · commit 600051de4931 · 2020-05-15T17:35:11.000-07:00
This implements (most of) the PEP-498 spec for f-strings, with two exceptions:

- raw f-strings (`fr` or `rf` prefixes) raise `SyntaxError` ("raw f-strings are not supported")
- one special corner case does not function as specified in the PEP (more on that in a moment)

This is implemented in the core as a syntax translation, brute-forcing all f-strings to run through `String.format`. For example, the statement `x='world'; print(f'hello {x}')` gets translated *at a syntax level* (injected into the lexer) to `x='world'; print('hello {}'.format(x))`. While this may lead to weird column results in tracebacks, it seemed like the fastest, most efficient, and *likely* most RAM-friendly option, despite being implemented under the hood with a completely separate `vstr_t`.

Since [string concatenation of adjacent literals is implemented in the lexer](534b7c3), two side effects emerge:

- All strings with at least one f-string portion are concatenated into a single literal which *must* be run through `String.format()` wholesale, and:
- Concatenation of a plain (non-f) string containing interpolation characters with an f-string will cause `IndexError`/`KeyError`, which is both different from CPython *and* different from the corner case mentioned in the PEP (which gives the following example):

```python
x = 10
y = 'hi'
assert ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e') == 'ab10{c}str< hi >de'
```

The above-linked commit detailed a pretty solid case for leaving string concatenation in the lexer rather than putting it in the parser, and undoing that decision would likely be disproportionately costly on resources for the sake of a probably-low-impact corner case. An alternative to become compliant with this corner case of the PEP would be to revert to string concatenation in the parser *only when an f-string is part of concatenation*, though I've done no investigation on the difficulty or costs of doing this. A decent set of tests is included.
I've manually tested this on the `unix` port on Linux and on a Feather M4 Express (`atmel-samd`) and things seem sane.
diff --git a/ports/bare-arm/mpconfigport.h b/ports/bare-arm/mpconfigport.h
@@ -37,6 +37,7 @@
 #define MICROPY_PY_ARRAY            (0)
 #define MICROPY_PY_ATTRTUPLE        (0)
 #define MICROPY_PY_COLLECTIONS      (0)
+#define MICROPY_PY_FSTRING          (0)
 #define MICROPY_PY_MATH             (0)
 #define MICROPY_PY_CMATH            (0)
 #define MICROPY_PY_IO               (0)
diff --git a/ports/unix/mpconfigport.h b/ports/unix/mpconfigport.h
@@ -122,6 +122,7 @@
 #define MICROPY_PY_SYS_EXC_INFO     (1)
 #define MICROPY_PY_COLLECTIONS_DEQUE (1)
 #define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
+#define MICROPY_PY_FSTRING          (1)
 #ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
 #define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
 #endif
diff --git a/ports/windows/mpconfigport.h b/ports/windows/mpconfigport.h
@@ -90,6 +90,7 @@
 #define MICROPY_PY_SYS_EXC_INFO     (1)
 #define MICROPY_PY_COLLECTIONS_DEQUE (1)
 #define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
+#define MICROPY_PY_FSTRING          (1)
 #define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
 #define MICROPY_PY_MATH_ISCLOSE     (1)
 #define MICROPY_PY_CMATH            (1)
diff --git a/py/lexer.c b/py/lexer.c
@@ -62,6 +62,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
     return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
 }
 
+STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
+    return is_char_or3(lex, c1, c2, c3) || lex->chr0 == c4; // true if chr0 is any of c1..c4
+}
+
 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
     return lex->chr1 == c;
 }
@@ -105,7 +109,21 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
 
+// Return true when the lexer is positioned at the start of a string or bytes
+// literal: a bare quote, or a quote preceded by a one- or two-character prefix.
+// The f-string prefixes ('f', 'fr', 'rf') count only when MICROPY_PY_FSTRING
+// is enabled.
 STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
+    #if !MICROPY_PY_FSTRING
+    // mp_lexer_to_next handles the 'f' prefix only when f-strings are compiled
+    // in, so with the feature disabled 'f', 'fr' and 'rf' must not be reported
+    // as string starts; no other alternative below can match these prefixes.
+    if (is_char(lex, 'f') || is_char_and(lex, 'r', 'f')) {
+        return false;
+    }
+    #endif
     return is_char_or(lex, '\'', '\"')
-           || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+           || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
+           || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
+               && is_char_following_following_or(lex, '\'', '\"')))
            || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
                && is_char_following_following_or(lex, '\'', '\"'));
 }
@@ -119,6 +125,29 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
     return is_head_of_identifier(lex) || is_digit(lex);
 }
 
+// Switch chr0..chr2 between the mp_reader stream and the injected vstr_postfix.
+STATIC void swap_char_banks(mp_lexer_t *lex) {
+    if (!lex->vstr_postfix_processing) {
+        // Postfix done: restore the reader look-ahead from the backup bank.
+        // chr3..chr5 are left as-is and must be treated as garbage afterwards.
+        lex->chr0 = lex->chr3;
+        lex->chr1 = lex->chr4;
+        lex->chr2 = lex->chr5;
+        vstr_reset(&lex->vstr_postfix);
+        lex->vstr_postfix_idx = 0;
+        return;
+    }
+    // Postfix pending: park the reader's look-ahead in the backup bank and
+    // prime chr0..chr2 from the first three bytes of vstr_postfix.
+    lex->chr3 = lex->chr0;
+    lex->chr4 = lex->chr1;
+    lex->chr5 = lex->chr2;
+    lex->chr0 = lex->vstr_postfix.buf[0];
+    lex->chr1 = lex->vstr_postfix.buf[1];
+    lex->chr2 = lex->vstr_postfix.buf[2];
+    lex->vstr_postfix_idx = 3; // next_char() keeps reading from buf[3] onwards
+}
+
 STATIC void next_char(mp_lexer_t *lex) {
     if (lex->chr0 == '\n') {
         // a new line
@@ -134,7 +163,16 @@ STATIC void next_char(mp_lexer_t *lex) {
 
     lex->chr0 = lex->chr1;
     lex->chr1 = lex->chr2;
-    lex->chr2 = lex->reader.readbyte(lex->reader.data);
+
+    if (lex->vstr_postfix_processing) {
+        if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
+            lex->chr2 = '\0';
+        } else {
+            lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
+        }
+    } else {
+        lex->chr2 = lex->reader.readbyte(lex->reader.data);
+    }
 
     if (lex->chr1 == '\r') {
         // CR is a new line, converted to LF
@@ -149,6 +187,11 @@ STATIC void next_char(mp_lexer_t *lex) {
     if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
         lex->chr2 = '\n';
     }
+
+    if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
+        lex->vstr_postfix_processing = false;
+        swap_char_banks(lex);
+    }
 }
 
 STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -270,7 +313,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
     return true;
 }
 
-STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
     // get first quoting character
     char quote_char = '\'';
     if (is_char(lex, '\"')) {
@@ -291,15 +334,69 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
     }
 
     size_t n_closing = 0;
+    # if MICROPY_PY_FSTRING
+    bool in_expression = false;
+    bool expression_eat = true;
+    # endif
+
     while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
         if (is_char(lex, quote_char)) {
             n_closing += 1;
             vstr_add_char(&lex->vstr, CUR_CHAR(lex));
         } else {
             n_closing = 0;
+
+            # if MICROPY_PY_FSTRING
+            if (is_fstring && is_char(lex, '{')) {
+                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+                in_expression = !in_expression;
+                expression_eat = in_expression;
+
+                if (lex->vstr_postfix.len == 0) {
+                    vstr_add_str(&lex->vstr_postfix, ".format(");
+                }
+
+                next_char(lex);
+                continue;
+            }
+
+            if (is_fstring && is_char(lex, '}')) {
+                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+
+                if (in_expression) {
+                    in_expression = false;
+                    vstr_add_char(&lex->vstr_postfix, ',');
+                }
+
+                next_char(lex);
+                continue;
+            }
+
+            if (in_expression) {
+                // throw errors for illegal chars inside f-string expressions
+                if (is_char(lex, '#') || is_char(lex, '\\')) {
+                    lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING;
+                    return;
+                } else if (is_char(lex, ':')) {
+                    expression_eat = false;
+                }
+
+                unichar c = CUR_CHAR(lex);
+                if (expression_eat) {
+                    vstr_add_char(&lex->vstr_postfix, c);
+                } else {
+                    vstr_add_char(&lex->vstr, c);
+                }
+
+                next_char(lex);
+                continue;
+            }
+            # endif
+
             if (is_char(lex, '\\')) {
                 next_char(lex);
                 unichar c = CUR_CHAR(lex);
+
                 if (is_raw) {
                     // raw strings allow escaping of quotes, but the backslash is also emitted
                     vstr_add_char(&lex->vstr, '\\');
@@ -448,6 +545,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
 }
 
 void mp_lexer_to_next(mp_lexer_t *lex) {
+    if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
+        // end format call injection
+        vstr_add_char(&lex->vstr_postfix, ')');
+        lex->vstr_postfix_processing = true;
+        swap_char_banks(lex);
+    }
+
     // start new token text
     vstr_reset(&lex->vstr);
 
@@ -503,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
         do {
             // parse type codes
             bool is_raw = false;
+            bool is_fstring = false;
             mp_token_kind_t kind = MP_TOKEN_STRING;
             int n_char = 0;
             if (is_char(lex, 'u')) {
@@ -521,7 +626,23 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
                     kind = MP_TOKEN_BYTES;
                     n_char = 2;
                 }
+                # if MICROPY_PY_FSTRING
+                if (is_char_following(lex, 'f')) {
+                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
+                    break;
+                }
+                # endif
             }
+            # if MICROPY_PY_FSTRING
+            else if (is_char(lex, 'f')) {
+                if (is_char_following(lex, 'r')) {
+                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
+                    break;
+                }
+                n_char = 1;
+                is_fstring = true;
+            }
+            # endif
 
             // Set or check token kind
             if (lex->tok_kind == MP_TOKEN_END) {
@@ -540,13 +661,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
             }
 
             // Parse the literal
-            parse_string_literal(lex, is_raw);
+            parse_string_literal(lex, is_raw, is_fstring);
 
             // Skip whitespace so we can check if there's another string following
             skip_whitespace(lex, true);
 
         } while (is_string_or_bytes(lex));
-
     } else if (is_head_of_identifier(lex)) {
         lex->tok_kind = MP_TOKEN_NAME;
 
@@ -700,6 +820,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
     lex->num_indent_level = 1;
     lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
     vstr_init(&lex->vstr, 32);
+    vstr_init(&lex->vstr_postfix, 0);
 
     // store sentinel for first indentation level
     lex->indent_level[0] = 0;
diff --git a/py/lexer.h b/py/lexer.h
@@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t {
     MP_TOKEN_INVALID,
     MP_TOKEN_DEDENT_MISMATCH,
     MP_TOKEN_LONELY_STRING_OPEN,
+    # if MICROPY_PY_FSTRING
+    MP_TOKEN_MALFORMED_FSTRING,
+    MP_TOKEN_FSTRING_RAW,
+    # endif
 
     MP_TOKEN_NEWLINE,
     MP_TOKEN_INDENT,
@@ -157,6 +161,7 @@ typedef struct _mp_lexer_t {
     mp_reader_t reader;         // stream source
 
     unichar chr0, chr1, chr2;   // current cached characters from source
+    unichar chr3, chr4, chr5;   // current cached characters from alt source
 
     size_t line;                // current source line
     size_t column;              // current source column
@@ -172,6 +177,9 @@ typedef struct _mp_lexer_t {
     size_t tok_column;          // token source column
     mp_token_kind_t tok_kind;   // token kind
     vstr_t vstr;                // token data
+    vstr_t vstr_postfix;        // postfix to apply to string
+    bool vstr_postfix_processing;
+    uint16_t vstr_postfix_idx;
 } mp_lexer_t;
 
 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);
diff --git a/py/mpconfig.h b/py/mpconfig.h
@@ -1108,6 +1108,12 @@ typedef double mp_float_t;
 #define MICROPY_PY_COLLECTIONS_NAMEDTUPLE__ASDICT (0)
 #endif
 
+// Whether to include support for PEP-498 f-strings
+#ifndef MICROPY_PY_FSTRING
+#define MICROPY_PY_FSTRING (0)
+#endif
+
+
 // Whether to provide "math" module
 #ifndef MICROPY_PY_MATH
 #define MICROPY_PY_MATH (1)
diff --git a/py/parse.c b/py/parse.c
@@ -1155,6 +1155,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
         } else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
             exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
                 MP_ERROR_TEXT("unindent doesn't match any outer indent level"));
+        # if MICROPY_PY_FSTRING
+        } else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) {
+            exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                MP_ERROR_TEXT("malformed f-string"));
+        } else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) {
+            exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
+                MP_ERROR_TEXT("raw f-strings are not supported"));
+        # endif
         } else {
             exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
                 MP_ERROR_TEXT("invalid syntax"));
diff --git a/tests/basics/string_pep498_fstring.py b/tests/basics/string_pep498_fstring.py
diff --git a/tests/basics/string_pep498_fstring.py.exp b/tests/basics/string_pep498_fstring.py.exp
diff --git a/tests/cmdline/cmd_parsetree.py.exp b/tests/cmdline/cmd_parsetree.py.exp