Skip to content

Commit d809fd0

Browse files
committed
Improve parser's one-extra-token lookahead mechanism.
There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way. In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster. I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs. I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, eg "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one.
1 parent 23a7835 commit d809fd0

File tree

8 files changed

+173
-113
lines changed

8 files changed

+173
-113
lines changed

src/backend/parser/gram.y

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -633,9 +633,9 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
633633
/*
634634
* The grammar thinks these are keywords, but they are not in the kwlist.h
635635
* list and so can never be entered directly. The filter in parser.c
636-
* creates these tokens when required.
636+
* creates these tokens when required (based on looking one token ahead).
637637
*/
638-
%token NULLS_FIRST NULLS_LAST WITH_ORDINALITY WITH_TIME
638+
%token NULLS_LA WITH_LA
639639

640640

641641
/* Precedence: lowest to highest */
@@ -873,6 +873,7 @@ CreateRoleStmt:
873873

874874

875875
opt_with: WITH {}
876+
| WITH_LA {}
876877
| /*EMPTY*/ {}
877878
;
878879

@@ -6673,8 +6674,8 @@ opt_asc_desc: ASC { $$ = SORTBY_ASC; }
66736674
| /*EMPTY*/ { $$ = SORTBY_DEFAULT; }
66746675
;
66756676

6676-
opt_nulls_order: NULLS_FIRST { $$ = SORTBY_NULLS_FIRST; }
6677-
| NULLS_LAST { $$ = SORTBY_NULLS_LAST; }
6677+
opt_nulls_order: NULLS_LA FIRST_P { $$ = SORTBY_NULLS_FIRST; }
6678+
| NULLS_LA LAST_P { $$ = SORTBY_NULLS_LAST; }
66786679
| /*EMPTY*/ { $$ = SORTBY_NULLS_DEFAULT; }
66796680
;
66806681

@@ -8923,7 +8924,7 @@ AlterTSDictionaryStmt:
89238924
;
89248925

89258926
AlterTSConfigurationStmt:
8926-
ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list WITH any_name_list
8927+
ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list any_with any_name_list
89278928
{
89288929
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89298930
n->cfgname = $5;
@@ -8933,7 +8934,7 @@ AlterTSConfigurationStmt:
89338934
n->replace = false;
89348935
$$ = (Node*)n;
89358936
}
8936-
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list WITH any_name_list
8937+
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list any_with any_name_list
89378938
{
89388939
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89398940
n->cfgname = $5;
@@ -8943,7 +8944,7 @@ AlterTSConfigurationStmt:
89438944
n->replace = false;
89448945
$$ = (Node*)n;
89458946
}
8946-
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name WITH any_name
8947+
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name any_with any_name
89478948
{
89488949
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89498950
n->cfgname = $5;
@@ -8953,7 +8954,7 @@ AlterTSConfigurationStmt:
89538954
n->replace = true;
89548955
$$ = (Node*)n;
89558956
}
8956-
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name WITH any_name
8957+
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name any_with any_name
89578958
{
89588959
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89598960
n->cfgname = $5;
@@ -8981,6 +8982,11 @@ AlterTSConfigurationStmt:
89818982
}
89828983
;
89838984

8985+
/* Use this if TIME or ORDINALITY after WITH should be taken as an identifier */
8986+
any_with: WITH {}
8987+
| WITH_LA {}
8988+
;
8989+
89848990

89858991
/*****************************************************************************
89868992
*
@@ -9891,6 +9897,8 @@ simple_select:
98919897
* AS (query) [ SEARCH or CYCLE clause ]
98929898
*
98939899
* We don't currently support the SEARCH or CYCLE clause.
9900+
*
9901+
* Recognizing WITH_LA here allows a CTE to be named TIME or ORDINALITY.
98949902
*/
98959903
with_clause:
98969904
WITH cte_list
@@ -9900,6 +9908,13 @@ with_clause:
99009908
$$->recursive = false;
99019909
$$->location = @1;
99029910
}
9911+
| WITH_LA cte_list
9912+
{
9913+
$$ = makeNode(WithClause);
9914+
$$->ctes = $2;
9915+
$$->recursive = false;
9916+
$$->location = @1;
9917+
}
99039918
| WITH RECURSIVE cte_list
99049919
{
99059920
$$ = makeNode(WithClause);
@@ -10601,7 +10616,7 @@ opt_col_def_list: AS '(' TableFuncElementList ')' { $$ = $3; }
1060110616
| /*EMPTY*/ { $$ = NIL; }
1060210617
;
1060310618

10604-
opt_ordinality: WITH_ORDINALITY { $$ = true; }
10619+
opt_ordinality: WITH_LA ORDINALITY { $$ = true; }
1060510620
| /*EMPTY*/ { $$ = false; }
1060610621
;
1060710622

@@ -11057,7 +11072,7 @@ ConstInterval:
1105711072
;
1105811073

1105911074
opt_timezone:
11060-
WITH_TIME ZONE { $$ = TRUE; }
11075+
WITH_LA TIME ZONE { $$ = TRUE; }
1106111076
| WITHOUT TIME ZONE { $$ = FALSE; }
1106211077
| /*EMPTY*/ { $$ = FALSE; }
1106311078
;

src/backend/parser/parser.c

Lines changed: 58 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,13 @@ raw_parser(const char *str)
6464
/*
6565
* Intermediate filter between parser and core lexer (core_yylex in scan.l).
6666
*
67-
* The filter is needed because in some cases the standard SQL grammar
67+
* This filter is needed because in some cases the standard SQL grammar
6868
* requires more than one token lookahead. We reduce these cases to one-token
69-
* lookahead by combining tokens here, in order to keep the grammar LALR(1).
69+
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
7070
*
7171
* Using a filter is simpler than trying to recognize multiword tokens
7272
* directly in scan.l, because we'd have to allow for comments between the
73-
* words. Furthermore it's not clear how to do it without re-introducing
73+
* words. Furthermore it's not clear how to do that without re-introducing
7474
* scanner backtrack, which would cost more performance than this filter
7575
* layer does.
7676
*
@@ -84,7 +84,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
8484
base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
8585
int cur_token;
8686
int next_token;
87-
core_YYSTYPE cur_yylval;
87+
int cur_token_length;
8888
YYLTYPE cur_yylloc;
8989

9090
/* Get next token --- we might already have it */
@@ -93,74 +93,85 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
9393
cur_token = yyextra->lookahead_token;
9494
lvalp->core_yystype = yyextra->lookahead_yylval;
9595
*llocp = yyextra->lookahead_yylloc;
96+
*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
9697
yyextra->have_lookahead = false;
9798
}
9899
else
99100
cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
100101

101-
/* Do we need to look ahead for a possible multiword token? */
102+
/*
103+
* If this token isn't one that requires lookahead, just return it. If it
104+
* does, determine the token length. (We could get that via strlen(), but
105+
* since we have such a small set of possibilities, hardwiring seems
106+
* feasible and more efficient.)
107+
*/
102108
switch (cur_token)
103109
{
104110
case NULLS_P:
111+
cur_token_length = 5;
112+
break;
113+
case WITH:
114+
cur_token_length = 4;
115+
break;
116+
default:
117+
return cur_token;
118+
}
105119

106-
/*
107-
* NULLS FIRST and NULLS LAST must be reduced to one token
108-
*/
109-
cur_yylval = lvalp->core_yystype;
110-
cur_yylloc = *llocp;
111-
next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
120+
/*
121+
* Identify end+1 of current token. core_yylex() has temporarily stored a
122+
* '\0' here, and will undo that when we call it again. We need to redo
123+
* it to fully revert the lookahead call for error reporting purposes.
124+
*/
125+
yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
126+
*llocp + cur_token_length;
127+
Assert(*(yyextra->lookahead_end) == '\0');
128+
129+
/*
130+
* Save and restore *llocp around the call. It might look like we could
131+
* avoid this by just passing &lookahead_yylloc to core_yylex(), but that
132+
* does not work because flex actually holds onto the last-passed pointer
133+
* internally, and will use that for error reporting. We need any error
134+
* reports to point to the current token, not the next one.
135+
*/
136+
cur_yylloc = *llocp;
137+
138+
/* Get next token, saving outputs into lookahead variables */
139+
next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
140+
yyextra->lookahead_token = next_token;
141+
yyextra->lookahead_yylloc = *llocp;
142+
143+
*llocp = cur_yylloc;
144+
145+
/* Now revert the un-truncation of the current token */
146+
yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
147+
*(yyextra->lookahead_end) = '\0';
148+
149+
yyextra->have_lookahead = true;
150+
151+
/* Replace cur_token if needed, based on lookahead */
152+
switch (cur_token)
153+
{
154+
case NULLS_P:
155+
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
112156
switch (next_token)
113157
{
114158
case FIRST_P:
115-
cur_token = NULLS_FIRST;
116-
break;
117159
case LAST_P:
118-
cur_token = NULLS_LAST;
119-
break;
120-
default:
121-
/* save the lookahead token for next time */
122-
yyextra->lookahead_token = next_token;
123-
yyextra->lookahead_yylval = lvalp->core_yystype;
124-
yyextra->lookahead_yylloc = *llocp;
125-
yyextra->have_lookahead = true;
126-
/* and back up the output info to cur_token */
127-
lvalp->core_yystype = cur_yylval;
128-
*llocp = cur_yylloc;
160+
cur_token = NULLS_LA;
129161
break;
130162
}
131163
break;
132164

133165
case WITH:
134-
135-
/*
136-
* WITH TIME and WITH ORDINALITY must each be reduced to one token
137-
*/
138-
cur_yylval = lvalp->core_yystype;
139-
cur_yylloc = *llocp;
140-
next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
166+
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
141167
switch (next_token)
142168
{
143169
case TIME:
144-
cur_token = WITH_TIME;
145-
break;
146170
case ORDINALITY:
147-
cur_token = WITH_ORDINALITY;
148-
break;
149-
default:
150-
/* save the lookahead token for next time */
151-
yyextra->lookahead_token = next_token;
152-
yyextra->lookahead_yylval = lvalp->core_yystype;
153-
yyextra->lookahead_yylloc = *llocp;
154-
yyextra->have_lookahead = true;
155-
/* and back up the output info to cur_token */
156-
lvalp->core_yystype = cur_yylval;
157-
*llocp = cur_yylloc;
171+
cur_token = WITH_LA;
158172
break;
159173
}
160174
break;
161-
162-
default:
163-
break;
164175
}
165176

166177
return cur_token;

src/include/parser/gramparse.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ typedef struct base_yy_extra_type
4646
int lookahead_token; /* one-token lookahead */
4747
core_YYSTYPE lookahead_yylval; /* yylval for lookahead token */
4848
YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
49+
char *lookahead_end; /* end of current token */
50+
char lookahead_hold_char; /* to be put back at *lookahead_end */
4951

5052
/*
5153
* State variables that belong to the grammar.

src/interfaces/ecpg/preproc/parse.pl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,8 @@
4242

4343
# or in the block
4444
my %replace_string = (
45-
'WITH_TIME' => 'with time',
46-
'WITH_ORDINALITY' => 'with ordinality',
47-
'NULLS_FIRST' => 'nulls first',
48-
'NULLS_LAST' => 'nulls last',
45+
'NULLS_LA' => 'nulls',
46+
'WITH_LA' => 'with',
4947
'TYPECAST' => '::',
5048
'DOT_DOT' => '..',
5149
'COLON_EQUALS' => ':=',);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy