Skip to content

Commit d6061d2

Browse files
committed
Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the form '^(foo)$'. Before, these could never be optimized into indexscans. The recent changes to make psql and pg_dump generate such patterns (for \d commands and -t and related switches, respectively) therefore represented a big performance hit for people with large pg_class catalogs, as seen in a recent gripe from Erik Jones. While at it, be more paranoid about case-sensitivity checking in multibyte encodings, and fix some other corner cases in which a regex might be interpreted too liberally.
1 parent 9c88830 commit d6061d2

File tree

3 files changed

+105
-46
lines changed

3 files changed

+105
-46
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66 2006/10/04 00:29:59 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.67 2007/01/03 22:39:26 tgl Exp $
1212
*
1313
* Alistair Crooks added the code for the regex caching
1414
* agc - cached the regular expressions used - there's a good chance
@@ -624,3 +624,12 @@ similar_escape(PG_FUNCTION_ARGS)
624624

625625
PG_RETURN_TEXT_P(result);
626626
}
627+
628+
/*
629+
* report whether regex_flavor is currently BASIC
630+
*/
631+
bool
632+
regex_flavor_is_basic(void)
633+
{
634+
return (regex_flavor == REG_BASIC);
635+
}

src/backend/utils/adt/selfuncs.c

Lines changed: 93 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.216 2006/12/23 00:43:11 tgl Exp $
18+
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.217 2007/01/03 22:39:26 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
38053805
* These routines support analysis of LIKE and regular-expression patterns
38063806
* by the planner/optimizer. It's important that they agree with the
38073807
* regular-expression code in backend/regex/ and the LIKE code in
3808-
* backend/utils/adt/like.c.
3808+
* backend/utils/adt/like.c. Also, the computation of the fixed prefix
3809+
* must be conservative: if we report a string longer than the true fixed
3810+
* prefix, the query may produce actually wrong answers, rather than just
3811+
* getting a bad selectivity estimate!
38093812
*
38103813
* Note that the prefix-analysis functions are called from
38113814
* backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
38373840
Oid typeid = patt_const->consttype;
38383841
int pos,
38393842
match_pos;
3843+
bool is_multibyte = (pg_database_encoding_max_length() > 1);
38403844

38413845
/* the right-hand const is type text or bytea */
38423846
Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
38803884
}
38813885

38823886
/*
3883-
* XXX I suspect isalpha() is not an adequately locale-sensitive test
3884-
* for characters that can vary under case folding?
3887+
* XXX In multibyte character sets, we can't trust isalpha, so assume
3888+
* any multibyte char is potentially case-varying.
38853889
*/
3886-
if (case_insensitive && isalpha((unsigned char) patt[pos]))
3887-
break;
3890+
if (case_insensitive)
3891+
{
3892+
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
3893+
break;
3894+
if (isalpha((unsigned char) patt[pos]))
3895+
break;
3896+
}
38883897

38893898
/*
38903899
* NOTE: this code used to think that %% meant a literal %, but
@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39293938
int pos,
39303939
match_pos,
39313940
prev_pos,
3932-
prev_match_pos,
3933-
paren_depth;
3941+
prev_match_pos;
3942+
bool have_leading_paren;
39343943
char *patt;
39353944
char *rest;
39363945
Oid typeid = patt_const->consttype;
3946+
bool is_basic = regex_flavor_is_basic();
3947+
bool is_multibyte = (pg_database_encoding_max_length() > 1);
39373948

39383949
/*
39393950
* Should be unnecessary, there are no bytea regex operators defined. As
@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39483959
/* the right-hand const is type text for all of these */
39493960
patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
39503961

3962+
/*
3963+
* Check for ARE director prefix. It's worth our trouble to recognize
3964+
* this because similar_escape() uses it.
3965+
*/
3966+
pos = 0;
3967+
if (strncmp(patt, "***:", 4) == 0)
3968+
{
3969+
pos = 4;
3970+
is_basic = false;
3971+
}
3972+
39513973
/* Pattern must be anchored left */
3952-
if (patt[0] != '^')
3974+
if (patt[pos] != '^')
39533975
{
39543976
rest = patt;
39553977

@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39583980

39593981
return Pattern_Prefix_None;
39603982
}
3983+
pos++;
39613984

39623985
/*
3963-
* If unquoted | is present at paren level 0 in pattern, then there are
3964-
* multiple alternatives for the start of the string.
3986+
* If '|' is present in pattern, then there may be multiple alternatives
3987+
* for the start of the string. (There are cases where this isn't so,
3988+
* for instance if the '|' is inside parens, but detecting that reliably
3989+
* is too hard.)
39653990
*/
3966-
paren_depth = 0;
3967-
for (pos = 1; patt[pos]; pos++)
3991+
if (strchr(patt + pos, '|') != NULL)
39683992
{
3969-
if (patt[pos] == '|' && paren_depth == 0)
3970-
{
3971-
rest = patt;
3993+
rest = patt;
39723994

3973-
*prefix_const = NULL;
3974-
*rest_const = string_to_const(rest, typeid);
3995+
*prefix_const = NULL;
3996+
*rest_const = string_to_const(rest, typeid);
39753997

3976-
return Pattern_Prefix_None;
3977-
}
3978-
else if (patt[pos] == '(')
3979-
paren_depth++;
3980-
else if (patt[pos] == ')' && paren_depth > 0)
3981-
paren_depth--;
3982-
else if (patt[pos] == '\\')
3983-
{
3984-
/* backslash quotes the next character */
3985-
pos++;
3986-
if (patt[pos] == '\0')
3987-
break;
3988-
}
3998+
return Pattern_Prefix_None;
39893999
}
39904000

39914001
/* OK, allocate space for pattern */
39924002
match = palloc(strlen(patt) + 1);
39934003
prev_match_pos = match_pos = 0;
39944004

3995-
/* note start at pos 1 to skip leading ^ */
3996-
for (prev_pos = pos = 1; patt[pos];)
4005+
/*
4006+
* We special-case the syntax '^(...)$' because psql uses it. But beware:
4007+
* in BRE mode these parentheses are just ordinary characters. Also,
4008+
* sequences beginning "(?" are not what they seem, unless they're "(?:".
4009+
* (We should recognize that, too, because of similar_escape().)
4010+
*
4011+
* Note: it's a bit bogus to be depending on the current regex_flavor
4012+
* setting here, because the setting could change before the pattern is
4013+
* used. We minimize the risk by trusting the flavor as little as we can,
4014+
* but perhaps it would be a good idea to get rid of the "basic" setting.
4015+
*/
4016+
have_leading_paren = false;
4017+
if (patt[pos] == '(' && !is_basic &&
4018+
(patt[pos + 1] != '?' || patt[pos + 2] == ':'))
4019+
{
4020+
have_leading_paren = true;
4021+
pos += (patt[pos + 1] != '?' ? 1 : 3);
4022+
}
4023+
4024+
/* Scan remainder of pattern */
4025+
prev_pos = pos;
4026+
while (patt[pos])
39974027
{
39984028
int len;
39994029

40004030
/*
40014031
* Check for characters that indicate multiple possible matches here.
4002-
* XXX I suspect isalpha() is not an adequately locale-sensitive test
4003-
* for characters that can vary under case folding?
4032+
* Also, drop out at ')' or '$' so the termination test works right.
40044033
*/
40054034
if (patt[pos] == '.' ||
40064035
patt[pos] == '(' ||
4036+
patt[pos] == ')' ||
40074037
patt[pos] == '[' ||
4008-
patt[pos] == '$' ||
4009-
(case_insensitive && isalpha((unsigned char) patt[pos])))
4038+
patt[pos] == '^' ||
4039+
patt[pos] == '$')
40104040
break;
40114041

40124042
/*
4013-
* In AREs, backslash followed by alphanumeric is an escape, not a
4014-
* quoted character. Must treat it as having multiple possible
4015-
* matches.
4043+
* XXX In multibyte character sets, we can't trust isalpha, so assume
4044+
* any multibyte char is potentially case-varying.
40164045
*/
4017-
if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
4018-
break;
4046+
if (case_insensitive)
4047+
{
4048+
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
4049+
break;
4050+
if (isalpha((unsigned char) patt[pos]))
4051+
break;
4052+
}
40194053

40204054
/*
40214055
* Check for quantifiers. Except for +, this means the preceding
40224056
* character is optional, so we must remove it from the prefix too!
4057+
* Note: in BREs, \{ is a quantifier.
40234058
*/
40244059
if (patt[pos] == '*' ||
40254060
patt[pos] == '?' ||
4026-
patt[pos] == '{')
4061+
patt[pos] == '{' ||
4062+
(patt[pos] == '\\' && patt[pos + 1] == '{'))
40274063
{
40284064
match_pos = prev_match_pos;
40294065
pos = prev_pos;
@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
40344070
pos = prev_pos;
40354071
break;
40364072
}
4073+
4074+
/*
4075+
* Normally, backslash quotes the next character. But in AREs,
4076+
* backslash followed by alphanumeric is an escape, not a quoted
4077+
* character. Must treat it as having multiple possible matches.
4078+
* In BREs, \( is a parenthesis, so don't trust that either.
4079+
* Note: since only ASCII alphanumerics are escapes, we don't have
4080+
* to be paranoid about multibyte here.
4081+
*/
40374082
if (patt[pos] == '\\')
40384083
{
4039-
/* backslash quotes the next character */
4084+
if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
4085+
break;
40404086
pos++;
40414087
if (patt[pos] == '\0')
40424088
break;
@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
40544100
match[match_pos] = '\0';
40554101
rest = &patt[pos];
40564102

4103+
if (have_leading_paren && patt[pos] == ')')
4104+
pos++;
4105+
40574106
if (patt[pos] == '$' && patt[pos + 1] == '\0')
40584107
{
40594108
rest = &patt[pos + 1];

src/include/utils/builtins.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.283 2006/12/30 21:21:55 tgl Exp $
10+
* $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.284 2007/01/03 22:39:26 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -477,6 +477,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
477477
extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
478478
extern Datum textregexreplace(PG_FUNCTION_ARGS);
479479
extern Datum similar_escape(PG_FUNCTION_ARGS);
480+
extern bool regex_flavor_is_basic(void);
480481

481482
/* regproc.c */
482483
extern Datum regprocin(PG_FUNCTION_ARGS);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy