Skip to content

Commit 600051d

Browse files
committed
py: Implement partial PEP-498 (f-string) support
This implements (most of) the PEP-498 spec for f-strings, with two exceptions:

- raw f-strings (`fr` or `rf` prefixes) raise `SyntaxError`
- one special corner case does not function as specified in the PEP (more on that in a moment)

This is implemented in the core as a syntax translation, brute-forcing all f-strings to run through `str.format`. For example, the statement `x='world'; print(f'hello {x}')` gets translated *at a syntax level* (injected into the lexer) to `x='world'; print('hello {}'.format(x))`. While this may lead to weird column results in tracebacks, it seemed like the fastest, most efficient, and *likely* most RAM-friendly option, despite being implemented under the hood with a completely separate `vstr_t`.

Since [string concatenation of adjacent literals is implemented in the lexer](534b7c3), two side effects emerge:

- All strings with at least one f-string portion are concatenated into a single literal which *must* be run through `str.format()` wholesale, and:
- Concatenating a plain (non-f) string literal containing interpolation characters (`{` or `}`) with an f-string will cause `IndexError`/`KeyError`, which is both different from CPython *and* different from the corner case mentioned in the PEP, which gives the following example:

```python
x = 10
y = 'hi'
assert ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e') == 'ab10{c}str< hi >de'
```

The above-linked commit detailed a pretty solid case for leaving string concatenation in the lexer rather than putting it in the parser, and undoing that decision would likely be disproportionately costly on resources for the sake of a probably-low-impact corner case. An alternative to become compliant with this corner case of the PEP would be to revert to string concatenation in the parser *only when an f-string is part of the concatenation*, though I've done no investigation on the difficulty or costs of doing this.

A decent set of tests is included.
I've manually tested this on the `unix` port on Linux and on a Feather M4 Express (`atmel-samd`) and things seem sane.
1 parent cd9a8c1 commit 600051d

File tree

10 files changed

+268
-7
lines changed

10 files changed

+268
-7
lines changed

ports/bare-arm/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#define MICROPY_PY_ARRAY (0)
3838
#define MICROPY_PY_ATTRTUPLE (0)
3939
#define MICROPY_PY_COLLECTIONS (0)
40+
#define MICROPY_PY_FSTRING (0)
4041
#define MICROPY_PY_MATH (0)
4142
#define MICROPY_PY_CMATH (0)
4243
#define MICROPY_PY_IO (0)

ports/unix/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@
122122
#define MICROPY_PY_SYS_EXC_INFO (1)
123123
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
124124
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
125+
#define MICROPY_PY_FSTRING (1)
125126
#ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
126127
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
127128
#endif

ports/windows/mpconfigport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
#define MICROPY_PY_SYS_EXC_INFO (1)
9191
#define MICROPY_PY_COLLECTIONS_DEQUE (1)
9292
#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (1)
93+
#define MICROPY_PY_FSTRING (1)
9394
#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (1)
9495
#define MICROPY_PY_MATH_ISCLOSE (1)
9596
#define MICROPY_PY_CMATH (1)

py/lexer.c

Lines changed: 126 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
6262
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
6363
}
6464

65+
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
66+
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
67+
}
68+
6569
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
6670
return lex->chr1 == c;
6771
}
@@ -105,7 +109,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
105109

106110
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
107111
return is_char_or(lex, '\'', '\"')
108-
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
112+
|| (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
113+
|| (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
114+
&& is_char_following_following_or(lex, '\'', '\"')))
109115
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
110116
&& is_char_following_following_or(lex, '\'', '\"'));
111117
}
@@ -119,6 +125,29 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
119125
return is_head_of_identifier(lex) || is_digit(lex);
120126
}
121127

128+
STATIC void swap_char_banks(mp_lexer_t *lex) {
129+
if (lex->vstr_postfix_processing) {
130+
lex->chr3 = lex->chr0;
131+
lex->chr4 = lex->chr1;
132+
lex->chr5 = lex->chr2;
133+
lex->chr0 = lex->vstr_postfix.buf[0];
134+
lex->chr1 = lex->vstr_postfix.buf[1];
135+
lex->chr2 = lex->vstr_postfix.buf[2];
136+
137+
lex->vstr_postfix_idx = 3;
138+
} else {
139+
// blindly reset to the "backup" bank when done postfix processing
140+
// this restores control to the mp_reader
141+
lex->chr0 = lex->chr3;
142+
lex->chr1 = lex->chr4;
143+
lex->chr2 = lex->chr5;
144+
// willfully ignoring setting chr3-5 here - WARNING consider those garbage data now
145+
146+
vstr_reset(&lex->vstr_postfix);
147+
lex->vstr_postfix_idx = 0;
148+
}
149+
}
150+
122151
STATIC void next_char(mp_lexer_t *lex) {
123152
if (lex->chr0 == '\n') {
124153
// a new line
@@ -134,7 +163,16 @@ STATIC void next_char(mp_lexer_t *lex) {
134163

135164
lex->chr0 = lex->chr1;
136165
lex->chr1 = lex->chr2;
137-
lex->chr2 = lex->reader.readbyte(lex->reader.data);
166+
167+
if (lex->vstr_postfix_processing) {
168+
if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
169+
lex->chr2 = '\0';
170+
} else {
171+
lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
172+
}
173+
} else {
174+
lex->chr2 = lex->reader.readbyte(lex->reader.data);
175+
}
138176

139177
if (lex->chr1 == '\r') {
140178
// CR is a new line, converted to LF
@@ -149,6 +187,11 @@ STATIC void next_char(mp_lexer_t *lex) {
149187
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
150188
lex->chr2 = '\n';
151189
}
190+
191+
if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
192+
lex->vstr_postfix_processing = false;
193+
swap_char_banks(lex);
194+
}
152195
}
153196

154197
STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -270,7 +313,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
270313
return true;
271314
}
272315

273-
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
316+
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
274317
// get first quoting character
275318
char quote_char = '\'';
276319
if (is_char(lex, '\"')) {
@@ -291,15 +334,69 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
291334
}
292335

293336
size_t n_closing = 0;
337+
# if MICROPY_PY_FSTRING
338+
bool in_expression = false;
339+
bool expression_eat = true;
340+
# endif
341+
294342
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
295343
if (is_char(lex, quote_char)) {
296344
n_closing += 1;
297345
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
298346
} else {
299347
n_closing = 0;
348+
349+
# if MICROPY_PY_FSTRING
350+
if (is_fstring && is_char(lex, '{')) {
351+
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
352+
in_expression = !in_expression;
353+
expression_eat = in_expression;
354+
355+
if (lex->vstr_postfix.len == 0) {
356+
vstr_add_str(&lex->vstr_postfix, ".format(");
357+
}
358+
359+
next_char(lex);
360+
continue;
361+
}
362+
363+
if (is_fstring && is_char(lex, '}')) {
364+
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
365+
366+
if (in_expression) {
367+
in_expression = false;
368+
vstr_add_char(&lex->vstr_postfix, ',');
369+
}
370+
371+
next_char(lex);
372+
continue;
373+
}
374+
375+
if (in_expression) {
376+
// throw errors for illegal chars inside f-string expressions
377+
if (is_char(lex, '#') || is_char(lex, '\\')) {
378+
lex->tok_kind = MP_TOKEN_MALFORMED_FSTRING;
379+
return;
380+
} else if (is_char(lex, ':')) {
381+
expression_eat = false;
382+
}
383+
384+
unichar c = CUR_CHAR(lex);
385+
if (expression_eat) {
386+
vstr_add_char(&lex->vstr_postfix, c);
387+
} else {
388+
vstr_add_char(&lex->vstr, c);
389+
}
390+
391+
next_char(lex);
392+
continue;
393+
}
394+
# endif
395+
300396
if (is_char(lex, '\\')) {
301397
next_char(lex);
302398
unichar c = CUR_CHAR(lex);
399+
303400
if (is_raw) {
304401
// raw strings allow escaping of quotes, but the backslash is also emitted
305402
vstr_add_char(&lex->vstr, '\\');
@@ -448,6 +545,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
448545
}
449546

450547
void mp_lexer_to_next(mp_lexer_t *lex) {
548+
if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
549+
// end format call injection
550+
vstr_add_char(&lex->vstr_postfix, ')');
551+
lex->vstr_postfix_processing = true;
552+
swap_char_banks(lex);
553+
}
554+
451555
// start new token text
452556
vstr_reset(&lex->vstr);
453557

@@ -503,6 +607,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
503607
do {
504608
// parse type codes
505609
bool is_raw = false;
610+
bool is_fstring = false;
506611
mp_token_kind_t kind = MP_TOKEN_STRING;
507612
int n_char = 0;
508613
if (is_char(lex, 'u')) {
@@ -521,7 +626,23 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
521626
kind = MP_TOKEN_BYTES;
522627
n_char = 2;
523628
}
629+
# if MICROPY_PY_FSTRING
630+
if (is_char_following(lex, 'f')) {
631+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
632+
break;
633+
}
634+
# endif
524635
}
636+
# if MICROPY_PY_FSTRING
637+
else if (is_char(lex, 'f')) {
638+
if (is_char_following(lex, 'r')) {
639+
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
640+
break;
641+
}
642+
n_char = 1;
643+
is_fstring = true;
644+
}
645+
# endif
525646

526647
// Set or check token kind
527648
if (lex->tok_kind == MP_TOKEN_END) {
@@ -540,13 +661,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
540661
}
541662

542663
// Parse the literal
543-
parse_string_literal(lex, is_raw);
664+
parse_string_literal(lex, is_raw, is_fstring);
544665

545666
// Skip whitespace so we can check if there's another string following
546667
skip_whitespace(lex, true);
547668

548669
} while (is_string_or_bytes(lex));
549-
550670
} else if (is_head_of_identifier(lex)) {
551671
lex->tok_kind = MP_TOKEN_NAME;
552672

@@ -700,6 +820,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
700820
lex->num_indent_level = 1;
701821
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
702822
vstr_init(&lex->vstr, 32);
823+
vstr_init(&lex->vstr_postfix, 0);
703824

704825
// store sentinel for first indentation level
705826
lex->indent_level[0] = 0;

py/lexer.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t {
4444
MP_TOKEN_INVALID,
4545
MP_TOKEN_DEDENT_MISMATCH,
4646
MP_TOKEN_LONELY_STRING_OPEN,
47+
# if MICROPY_PY_FSTRING
48+
MP_TOKEN_MALFORMED_FSTRING,
49+
MP_TOKEN_FSTRING_RAW,
50+
# endif
4751

4852
MP_TOKEN_NEWLINE,
4953
MP_TOKEN_INDENT,
@@ -157,6 +161,7 @@ typedef struct _mp_lexer_t {
157161
mp_reader_t reader; // stream source
158162

159163
unichar chr0, chr1, chr2; // current cached characters from source
164+
unichar chr3, chr4, chr5; // current cached characters from alt source
160165

161166
size_t line; // current source line
162167
size_t column; // current source column
@@ -172,6 +177,9 @@ typedef struct _mp_lexer_t {
172177
size_t tok_column; // token source column
173178
mp_token_kind_t tok_kind; // token kind
174179
vstr_t vstr; // token data
180+
vstr_t vstr_postfix; // postfix to apply to string
181+
bool vstr_postfix_processing;
182+
uint16_t vstr_postfix_idx;
175183
} mp_lexer_t;
176184

177185
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);

py/mpconfig.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,12 @@ typedef double mp_float_t;
11081108
#define MICROPY_PY_COLLECTIONS_NAMEDTUPLE__ASDICT (0)
11091109
#endif
11101110

1111+
// Whether to include support for PEP-498 f-strings
1112+
#ifndef MICROPY_PY_FSTRING
1113+
#define MICROPY_PY_FSTRING (0)
1114+
#endif
1115+
1116+
11111117
// Whether to provide "math" module
11121118
#ifndef MICROPY_PY_MATH
11131119
#define MICROPY_PY_MATH (1)

py/parse.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
11551155
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
11561156
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
11571157
MP_ERROR_TEXT("unindent doesn't match any outer indent level"));
1158+
# if MICROPY_PY_FSTRING
1159+
} else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) {
1160+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1161+
MP_ERROR_TEXT("malformed f-string"));
1162+
} else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) {
1163+
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
1164+
MP_ERROR_TEXT("raw f-strings are not supported"));
1165+
# endif
11581166
} else {
11591167
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
11601168
MP_ERROR_TEXT("invalid syntax"));

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy