From faefec5b1f96a93dfdecccfb241e3bf340743ba7 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 16:09:56 -0700 Subject: [PATCH 1/5] gh-97997: Add col_offset field to tokenizer and use that for AST nodes --- Parser/tokenizer.c | 46 +++++++++++++++++++++++++++++++++++++++------- Parser/tokenizer.h | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index c5d3e580247cc1..0420cb23092f73 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -37,6 +37,8 @@ #define TABSIZE 8 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) +#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ + type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) /* Forward */ static struct tok_state *tok_new(void); @@ -73,6 +75,8 @@ tok_new(void) tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; + tok->starting_col_offset = -1; + tok->col_offset = -1; tok->level = 0; tok->altindstack[0] = 0; tok->decoding_state = STATE_INIT; @@ -872,6 +876,7 @@ tok_underflow_string(struct tok_state *tok) { } tok->line_start = tok->cur; tok->lineno++; + tok->col_offset = 0; tok->inp = end; return 1; } @@ -931,6 +936,7 @@ tok_underflow_interactive(struct tok_state *tok) { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; size_t size = strlen(newtok); tok->lineno++; + tok->col_offset = 0; if (!tok_reserve_buf(tok, size + 1)) { PyMem_Free(tok->buf); tok->buf = NULL; @@ -944,6 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) { } else { tok->lineno++; + tok->col_offset = 0; PyMem_Free(tok->buf); tok->buf = newtok; tok->cur = tok->buf; @@ -999,6 +1006,7 @@ tok_underflow_file(struct tok_state *tok) { } tok->lineno++; + tok->col_offset = 0; if (tok->decoding_state != STATE_NORMAL) { if (tok->lineno > 2) { tok->decoding_state = STATE_NORMAL; @@ -1056,6 +1064,7 @@ tok_nextc(struct tok_state *tok) int 
rc; for (;;) { if (tok->cur != tok->inp) { + tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } if (tok->done != E_OK) { @@ -1104,6 +1113,7 @@ tok_backup(struct tok_state *tok, int c) if ((int)(unsigned char)*tok->cur != c) { Py_FatalError("tok_backup: wrong character"); } + tok->col_offset--; } } @@ -1390,6 +1400,19 @@ tok_continuation_line(struct tok_state *tok) { return c; } +static int +type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset, + int end_col_offset, const char *start, const char *end) +{ + token->level = tok->level; + token->lineno = token->end_lineno = tok->lineno; + token->col_offset = col_offset; + token->end_col_offset = end_col_offset; + token->start = start; + token->end = end; + return type; +} + static int token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) { @@ -1397,14 +1420,13 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st token->level = tok->level; token->lineno = type == STRING ? tok->first_lineno : tok->lineno; token->end_lineno = tok->lineno; - token->col_offset = -1; - token->end_col_offset = -1; + token->col_offset = token->end_col_offset = -1; token->start = start; token->end = end; + if (start != NULL && end != NULL) { - const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start; - token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1; - token->end_col_offset = (end >= tok->line_start) ? 
(int)(end - tok->line_start) : -1; + token->col_offset = tok->starting_col_offset; + token->end_col_offset = tok->col_offset; } return type; } @@ -1419,6 +1441,7 @@ tok_get(struct tok_state *tok, struct token *token) const char *p_end = NULL; nextline: tok->start = NULL; + tok->starting_col_offset = -1; blankline = 0; /* Get indentation level */ @@ -1426,6 +1449,7 @@ tok_get(struct tok_state *tok, struct token *token) int col = 0; int altcol = 0; tok->atbol = 0; + tok->starting_col_offset = 0; int cont_line_col = 0; for (;;) { c = tok_nextc(tok); @@ -1518,6 +1542,7 @@ tok_get(struct tok_state *tok, struct token *token) } tok->start = tok->cur; + tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ if (tok->pendin != 0) { @@ -1565,10 +1590,12 @@ tok_get(struct tok_state *tok, struct token *token) /* Set start of current token */ tok->start = tok->cur == NULL ? NULL : tok->cur - 1; + tok->starting_col_offset = tok->col_offset - 1; /* Skip comment, unless it's a type comment */ if (c == '#') { const char *prefix, *p, *type_start; + int current_starting_col_offset; while (c != EOF && c != '\n') { c = tok_nextc(tok); @@ -1576,14 +1603,17 @@ tok_get(struct tok_state *tok, struct token *token) if (tok->type_comments) { p = tok->start; + current_starting_col_offset = tok->starting_col_offset; prefix = type_comment_prefix; while (*prefix && p < tok->cur) { if (*prefix == ' ') { while (*p == ' ' || *p == '\t') { p++; + current_starting_col_offset++; } } else if (*prefix == *p) { p++; + current_starting_col_offset++; } else { break; } @@ -1595,6 +1625,7 @@ tok_get(struct tok_state *tok, struct token *token) if (!*prefix) { int is_type_ignore = 1; const char *ignore_end = p + 6; + const int ignore_end_col_offset = current_starting_col_offset + 6; tok_backup(tok, c); /* don't eat the newline or EOF */ type_start = p; @@ -1615,11 +1646,12 @@ tok_get(struct tok_state *tok, struct token *token) tok_nextc(tok); tok->atbol = 1; } - return 
MAKE_TOKEN(TYPE_IGNORE); + // +6 below cause we need to skip the ignore part + return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); } else { p_start = type_start; p_end = tok->cur; - return MAKE_TOKEN(TYPE_COMMENT); + return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); } } } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 5b8c7f314386ec..2542d30e1da0ed 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -57,6 +57,8 @@ struct tok_state { int lineno; /* Current line number */ int first_lineno; /* First line of a single line or multi line string expression (cf. issue 16806) */ + int starting_col_offset; /* The column offset at the beginning of a token */ + int col_offset; /* Current col offset */ int level; /* () [] {} Parentheses nesting level */ /* Used to allow free continuations inside them */ char parenstack[MAXLEVEL]; From d3f852ca568be8258d0c8ba6bf069876e05e2790 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 6 Oct 2022 23:13:37 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst new file mode 100644 index 00000000000000..5cb5e2126638be --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst @@ -0,0 +1 @@ +Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic. 
From 5c47419ee8941e752ddab2a608d8f6f0d5aafee6 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 17:18:59 -0700 Subject: [PATCH 3/5] Remove unnecessary assignment of starting_col_offset --- Parser/tokenizer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0420cb23092f73..68808a62826c70 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1449,7 +1449,6 @@ tok_get(struct tok_state *tok, struct token *token) int col = 0; int altcol = 0; tok->atbol = 0; - tok->starting_col_offset = 0; int cont_line_col = 0; for (;;) { c = tok_nextc(tok); From 4796a400f71a3adb0f285f11f301f3ba33f188d8 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 17:21:16 -0700 Subject: [PATCH 4/5] Add comment --- Parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 68808a62826c70..62af172f52b3cd 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1623,6 +1623,7 @@ tok_get(struct tok_state *tok, struct token *token) /* This is a type comment if we matched all of type_comment_prefix. 
*/ if (!*prefix) { int is_type_ignore = 1; + // +6 in order to skip the word 'ignore' const char *ignore_end = p + 6; const int ignore_end_col_offset = current_starting_col_offset + 6; tok_backup(tok, c); /* don't eat the newline or EOF */ @@ -1645,7 +1646,6 @@ tok_get(struct tok_state *tok, struct token *token) tok_nextc(tok); tok->atbol = 1; } - // +6 below cause we need to skip the ignore part return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); } else { p_start = type_start; From d128a37eae72ce5eafa3536f421668ee1b1bbbbc Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 7 Oct 2022 10:20:47 -0700 Subject: [PATCH 5/5] Address feedback; add macro to advance new line --- Parser/tokenizer.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 62af172f52b3cd..1c356d3d47c945 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -39,6 +39,9 @@ #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) +#define ADVANCE_LINENO() \ + tok->lineno++; \ + tok->col_offset = 0; /* Forward */ static struct tok_state *tok_new(void); @@ -875,8 +878,7 @@ tok_underflow_string(struct tok_state *tok) { tok->buf = tok->cur; } tok->line_start = tok->cur; - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); tok->inp = end; return 1; } @@ -935,8 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) { else if (tok->start != NULL) { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; size_t size = strlen(newtok); - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); if (!tok_reserve_buf(tok, size + 1)) { PyMem_Free(tok->buf); tok->buf = NULL; @@ -949,8 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) { tok->multi_line_start = tok->buf + 
cur_multi_line_start; } else { - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); PyMem_Free(tok->buf); tok->buf = newtok; tok->cur = tok->buf; @@ -1005,8 +1005,7 @@ tok_underflow_file(struct tok_state *tok) { *tok->inp = '\0'; } - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); if (tok->decoding_state != STATE_NORMAL) { if (tok->lineno > 2) { tok->decoding_state = STATE_NORMAL;
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy