Skip to content

Commit 173e29a

Browse files
committed
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug.
1 parent 0c9e5d5 commit 173e29a

File tree

6 files changed

+884
-55
lines changed

6 files changed

+884
-55
lines changed

src/backend/regex/README

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
102102
either plain regular expressions (which are executed as DFAs in the manner
103103
described above) or back-references (which try to match the input to some
104104
previous substring). Non-leaf nodes are capture nodes (which save the
105-
location of the substring currently matching their child node) or
106-
concatenation or alternation nodes. At execution time, the executor
107-
recursively scans the tree. At concatenation or alternation nodes,
108-
it considers each possible alternative way of matching the input string,
109-
ie each place where the string could be split for a concatenation, or each
110-
child node for an alternation. It tries the next alternative if the match
111-
fails according to the child nodes. This is exactly the sort of
112-
backtracking search done by a traditional NFA regex engine. If there are
113-
many tree levels it can get very slow.
105+
location of the substring currently matching their child node),
106+
concatenation, alternation, or iteration nodes. At execution time, the
107+
executor recursively scans the tree. At concatenation, alternation, or
108+
iteration nodes, it considers each possible alternative way of matching the
109+
input string, that is each place where the string could be split for a
110+
concatenation or iteration, or each child node for an alternation. It
111+
tries the next alternative if the match fails according to the child nodes.
112+
This is exactly the sort of backtracking search done by a traditional NFA
113+
regex engine. If there are many tree levels it can get very slow.
114114

115115
But all is not lost: we can still be smarter than the average pure NFA
116116
engine. To do this, each subre node has an associated DFA, which

src/backend/regex/regcomp.c

Lines changed: 52 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
10361036
/*----------
10371037
* Prepare a general-purpose state skeleton.
10381038
*
1039-
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1040-
* / /
1041-
* [lp] ----> [s2] ----bypass---------------------
1039+
* In the no-backrefs case, we want this:
10421040
*
1043-
* where bypass is an empty, and prefix is some repetitions of atom
1041+
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1042+
*
1043+
* where prefix is some repetitions of atom. In the general case we need
1044+
*
1045+
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1046+
*
1047+
* where the iterator wraps around [begin] ---atom---> [end]
1048+
*
1049+
* We make the s state here for both cases; s2 is made below if needed
10441050
*----------
10451051
*/
10461052
s = newstate(v->nfa); /* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
10511057
NOERR();
10521058
atom->begin = s;
10531059
atom->end = s2;
1054-
s = newstate(v->nfa); /* and spots for prefix and bypass */
1055-
s2 = newstate(v->nfa);
1060+
s = newstate(v->nfa); /* set up starting state */
10561061
NOERR();
10571062
EMPTYARC(lp, s);
1058-
EMPTYARC(lp, s2);
10591063
NOERR();
10601064

10611065
/* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
10891093
}
10901094

10911095
/*
1092-
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal
1093-
* with quantifiers internally. Otherwise, the first step is to turn
1094-
* x{0,...} into x{1,...}|empty
1096+
* It's quantifier time. If the atom is just a backref, we'll let it deal
1097+
* with quantifiers internally.
10951098
*/
1096-
if (m == 0 && atomtype != BACKREF)
1097-
{
1098-
EMPTYARC(s2, atom->end); /* the bypass */
1099-
assert(PREF(qprefer) != 0);
1100-
f = COMBINE(qprefer, atom->flags);
1101-
t = subre(v, '|', f, lp, atom->end);
1102-
NOERR();
1103-
t->left = atom;
1104-
t->right = subre(v, '|', PREF(f), s2, atom->end);
1105-
NOERR();
1106-
t->right->left = subre(v, '=', 0, s2, atom->end);
1107-
NOERR();
1108-
*atomp = t;
1109-
atomp = &t->left;
1110-
m = 1;
1111-
}
1112-
1113-
/* deal with the rest of the quantifier */
11141099
if (atomtype == BACKREF)
11151100
{
11161101
/* special case: backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
11201105
atom->min = (short) m;
11211106
atom->max = (short) n;
11221107
atom->flags |= COMBINE(qprefer, atom->flags);
1108+
/* rest of branch can be strung starting from atom->end */
1109+
s2 = atom->end;
11231110
}
11241111
else if (m == 1 && n == 1)
11251112
{
11261113
/* no/vacuous quantifier: done */
11271114
EMPTYARC(s, atom->begin); /* empty prefix */
1115+
/* rest of branch can be strung starting from atom->end */
1116+
s2 = atom->end;
11281117
}
1129-
else
1118+
else if (m > 0 && !(atom->flags & BACKR))
11301119
{
11311120
/*
1132-
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
1133-
* second x
1121+
* If there's no backrefs involved, we can turn x{m,n} into
1122+
* x{m-1,n-1}x, with capturing parens in only the second x. This
1123+
* is valid because we only care about capturing matches from the
1124+
* final iteration of the quantifier. It's a win because we can
1125+
* implement the backref-free left side as a plain DFA node, since
1126+
* we don't really care where its submatches are.
11341127
*/
11351128
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
11361129
assert(m >= 1 && m != INFINITY && n >= 1);
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
11421135
NOERR();
11431136
t->right = atom;
11441137
*atomp = t;
1138+
/* rest of branch can be strung starting from atom->end */
1139+
s2 = atom->end;
1140+
}
1141+
else
1142+
{
1143+
/* general case: need an iteration node */
1144+
s2 = newstate(v->nfa);
1145+
NOERR();
1146+
moveouts(v->nfa, atom->end, s2);
1147+
NOERR();
1148+
dupnfa(v->nfa, atom->begin, atom->end, s, s2);
1149+
repeat(v, s, s2, m, n);
1150+
f = COMBINE(qprefer, atom->flags);
1151+
t = subre(v, '*', f, s, s2);
1152+
NOERR();
1153+
t->min = (short) m;
1154+
t->max = (short) n;
1155+
t->left = atom;
1156+
*atomp = t;
1157+
/* rest of branch is to be strung from iteration's end state */
11451158
}
11461159

11471160
/* and finally, look after that postponed recursion */
11481161
t = top->right;
11491162
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1150-
t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1163+
t->right = parsebranch(v, stopper, type, s2, rp, 1);
11511164
else
11521165
{
1153-
EMPTYARC(atom->end, rp);
1154-
t->right = subre(v, '=', 0, atom->end, rp);
1166+
EMPTYARC(s2, rp);
1167+
t->right = subre(v, '=', 0, s2, rp);
11551168
}
11561169
assert(SEE('|') || SEE(stopper) || SEE(EOS));
11571170
t->flags |= COMBINE(t->flags, t->right->flags);
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
12141227
/*
12151228
* repeat - replicate subNFA for quantifiers
12161229
*
1230+
* The sub-NFA strung from lp to rp is modified to represent m to n
1231+
* repetitions of its initial contents.
1232+
*
12171233
* The duplication sequences used here are chosen carefully so that any
12181234
* pointers starting out pointing into the subexpression end up pointing into
12191235
* the last occurrence. (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
12291245
int n)
12301246
{
12311247
#define SOME 2
1232-
#define INF 3
1248+
#define INF 3
12331249
#define PAIR(x, y) ((x)*4 + (y))
12341250
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
12351251
const int rm = REDUCE(m);
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
16031619
v->treechain = ret;
16041620
}
16051621

1606-
assert(strchr("|.b(=", op) != NULL);
1622+
assert(strchr("=b|.*(", op) != NULL);
16071623

16081624
ret->op = op;
16091625
ret->flags = flags;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy