Skip to content

Commit 3c381a5

Browse files
committed
Teach pattern_fixed_prefix() about collations.
This is necessary, not optional, now that ILIKE and regexes are collation-aware; otherwise we might derive a wrong comparison constant for index-optimized pattern matches.
1 parent dad1f46 commit 3c381a5

File tree

3 files changed

+117
-55
lines changed

3 files changed

+117
-55
lines changed

src/backend/optimizer/path/indxpath.c

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2446,6 +2446,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
24462446
bool isIndexable = false;
24472447
Node *rightop;
24482448
Oid expr_op;
2449+
Oid expr_coll;
24492450
Const *patt;
24502451
Const *prefix = NULL;
24512452
Const *rest = NULL;
@@ -2462,6 +2463,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
24622463
/* we know these will succeed */
24632464
rightop = get_rightop(clause);
24642465
expr_op = ((OpExpr *) clause)->opno;
2466+
expr_coll = ((OpExpr *) clause)->inputcollid;
24652467

24662468
/* again, required for all current special ops: */
24672469
if (!IsA(rightop, Const) ||
@@ -2475,13 +2477,13 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
24752477
case OID_BPCHAR_LIKE_OP:
24762478
case OID_NAME_LIKE_OP:
24772479
/* the right-hand const is type text for all of these */
2478-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
2480+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll,
24792481
&prefix, &rest);
24802482
isIndexable = (pstatus != Pattern_Prefix_None);
24812483
break;
24822484

24832485
case OID_BYTEA_LIKE_OP:
2484-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
2486+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll,
24852487
&prefix, &rest);
24862488
isIndexable = (pstatus != Pattern_Prefix_None);
24872489
break;
@@ -2490,7 +2492,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
24902492
case OID_BPCHAR_ICLIKE_OP:
24912493
case OID_NAME_ICLIKE_OP:
24922494
/* the right-hand const is type text for all of these */
2493-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC,
2495+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, expr_coll,
24942496
&prefix, &rest);
24952497
isIndexable = (pstatus != Pattern_Prefix_None);
24962498
break;
@@ -2499,7 +2501,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
24992501
case OID_BPCHAR_REGEXEQ_OP:
25002502
case OID_NAME_REGEXEQ_OP:
25012503
/* the right-hand const is type text for all of these */
2502-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex,
2504+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, expr_coll,
25032505
&prefix, &rest);
25042506
isIndexable = (pstatus != Pattern_Prefix_None);
25052507
break;
@@ -2508,7 +2510,7 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
25082510
case OID_BPCHAR_ICREGEXEQ_OP:
25092511
case OID_NAME_ICREGEXEQ_OP:
25102512
/* the right-hand const is type text for all of these */
2511-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC,
2513+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, expr_coll,
25122514
&prefix, &rest);
25132515
isIndexable = (pstatus != Pattern_Prefix_None);
25142516
break;
@@ -2544,10 +2546,9 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
25442546
*
25452547
* The non-pattern opclasses will not sort the way we need in most non-C
25462548
* locales. We can use such an index anyway for an exact match (simple
2547-
* equality), but not for prefix-match cases. Note that we are looking at
2548-
* the index's collation, not the expression's collation -- this test is
2549-
* not dependent on the LIKE/regex operator's collation (which would only
2550-
* affect case folding behavior of ILIKE, anyway).
2549+
* equality), but not for prefix-match cases. Note that here we are
2550+
* looking at the index's collation, not the expression's collation --
2551+
* this test is *not* dependent on the LIKE/regex operator's collation.
25512552
*/
25522553
switch (expr_op)
25532554
{
@@ -2558,7 +2559,8 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
25582559
isIndexable =
25592560
(opfamily == TEXT_PATTERN_BTREE_FAM_OID) ||
25602561
(opfamily == TEXT_BTREE_FAM_OID &&
2561-
(pstatus == Pattern_Prefix_Exact || lc_collate_is_c(idxcollation)));
2562+
(pstatus == Pattern_Prefix_Exact ||
2563+
lc_collate_is_c(idxcollation)));
25622564
break;
25632565

25642566
case OID_BPCHAR_LIKE_OP:
@@ -2568,7 +2570,8 @@ match_special_index_operator(Expr *clause, Oid opfamily, Oid idxcollation,
25682570
isIndexable =
25692571
(opfamily == BPCHAR_PATTERN_BTREE_FAM_OID) ||
25702572
(opfamily == BPCHAR_BTREE_FAM_OID &&
2571-
(pstatus == Pattern_Prefix_Exact || lc_collate_is_c(idxcollation)));
2573+
(pstatus == Pattern_Prefix_Exact ||
2574+
lc_collate_is_c(idxcollation)));
25722575
break;
25732576

25742577
case OID_NAME_LIKE_OP:
@@ -2770,6 +2773,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
27702773
Node *leftop = get_leftop(clause);
27712774
Node *rightop = get_rightop(clause);
27722775
Oid expr_op = ((OpExpr *) clause)->opno;
2776+
Oid expr_coll = ((OpExpr *) clause)->inputcollid;
27732777
Const *patt = (Const *) rightop;
27742778
Const *prefix = NULL;
27752779
Const *rest = NULL;
@@ -2791,7 +2795,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
27912795
case OID_BYTEA_LIKE_OP:
27922796
if (!op_in_opfamily(expr_op, opfamily))
27932797
{
2794-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
2798+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like, expr_coll,
27952799
&prefix, &rest);
27962800
return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
27972801
}
@@ -2803,7 +2807,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
28032807
if (!op_in_opfamily(expr_op, opfamily))
28042808
{
28052809
/* the right-hand const is type text for all of these */
2806-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC,
2810+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like_IC, expr_coll,
28072811
&prefix, &rest);
28082812
return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
28092813
}
@@ -2815,7 +2819,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
28152819
if (!op_in_opfamily(expr_op, opfamily))
28162820
{
28172821
/* the right-hand const is type text for all of these */
2818-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex,
2822+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex, expr_coll,
28192823
&prefix, &rest);
28202824
return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
28212825
}
@@ -2827,7 +2831,7 @@ expand_indexqual_opclause(RestrictInfo *rinfo, Oid opfamily, Oid idxcollation)
28272831
if (!op_in_opfamily(expr_op, opfamily))
28282832
{
28292833
/* the right-hand const is type text for all of these */
2830-
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC,
2834+
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC, expr_coll,
28312835
&prefix, &rest);
28322836
return prefix_quals(leftop, opfamily, idxcollation, prefix, pstatus);
28332837
}

src/backend/utils/adt/selfuncs.c

Lines changed: 97 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,9 +1181,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11811181
return result;
11821182
}
11831183

1184-
/* divide pattern into fixed prefix and remainder */
1184+
/*
1185+
* Divide pattern into fixed prefix and remainder. XXX we have to assume
1186+
* default collation here, because we don't have access to the actual
1187+
* input collation for the operator. FIXME: pass the real input collation in.
1188+
*/
11851189
patt = (Const *) other;
1186-
pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
1190+
pstatus = pattern_fixed_prefix(patt, ptype, DEFAULT_COLLATION_OID,
1191+
&prefix, &rest);
11871192

11881193
/*
11891194
* If necessary, coerce the prefix constant to the right type. (The "rest"
@@ -4755,6 +4760,29 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
47554760
*-------------------------------------------------------------------------
47564761
*/
47574762

4763+
/*
4764+
* Check whether char is a letter (and, hence, subject to case-folding)
4765+
*
4766+
* In multibyte character sets, we can't use isalpha, and it does not seem
4767+
* worth trying to convert to wchar_t to use iswalpha. Instead, just assume
4768+
* any multibyte char is potentially case-varying.
4769+
*/
4770+
static int
4771+
pattern_char_isalpha(char c, bool is_multibyte,
4772+
pg_locale_t locale, bool locale_is_c)
4773+
{
4774+
if (locale_is_c)
4775+
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
4776+
else if (is_multibyte && IS_HIGHBIT_SET(c))
4777+
return true;
4778+
#ifdef HAVE_LOCALE_T
4779+
else if (locale)
4780+
return isalpha_l((unsigned char) c, locale);
4781+
#endif
4782+
else
4783+
return isalpha((unsigned char) c);
4784+
}
4785+
47584786
/*
47594787
* Extract the fixed prefix, if any, for a pattern.
47604788
*
@@ -4769,7 +4797,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
47694797
*/
47704798

47714799
static Pattern_Prefix_Status
4772-
like_fixed_prefix(Const *patt_const, bool case_insensitive,
4800+
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
47734801
Const **prefix_const, Const **rest_const)
47744802
{
47754803
char *match;
@@ -4780,15 +4808,39 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
47804808
int pos,
47814809
match_pos;
47824810
bool is_multibyte = (pg_database_encoding_max_length() > 1);
4811+
pg_locale_t locale = 0;
4812+
bool locale_is_c = false;
47834813

47844814
/* the right-hand const is type text or bytea */
47854815
Assert(typeid == BYTEAOID || typeid == TEXTOID);
47864816

4787-
if (typeid == BYTEAOID && case_insensitive)
4788-
ereport(ERROR,
4789-
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4817+
if (case_insensitive)
4818+
{
4819+
if (typeid == BYTEAOID)
4820+
ereport(ERROR,
4821+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
47904822
errmsg("case insensitive matching not supported on type bytea")));
47914823

4824+
/* If case-insensitive, we need locale info */
4825+
if (lc_ctype_is_c(collation))
4826+
locale_is_c = true;
4827+
else if (collation != DEFAULT_COLLATION_OID)
4828+
{
4829+
if (!OidIsValid(collation))
4830+
{
4831+
/*
4832+
* This typically means that the parser could not resolve a
4833+
* conflict of implicit collations, so report it that way.
4834+
*/
4835+
ereport(ERROR,
4836+
(errcode(ERRCODE_INDETERMINATE_COLLATION),
4837+
errmsg("could not determine which collation to use for ILIKE"),
4838+
errhint("Use the COLLATE clause to set the collation explicitly.")));
4839+
}
4840+
locale = pg_newlocale_from_collation(collation);
4841+
}
4842+
}
4843+
47924844
if (typeid != BYTEAOID)
47934845
{
47944846
patt = TextDatumGetCString(patt_const->constvalue);
@@ -4822,23 +4874,11 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
48224874
break;
48234875
}
48244876

4825-
/*
4826-
* XXX In multibyte character sets, we can't trust isalpha, so assume
4827-
* any multibyte char is potentially case-varying.
4828-
*/
4829-
if (case_insensitive)
4830-
{
4831-
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
4832-
break;
4833-
if (isalpha((unsigned char) patt[pos]))
4834-
break;
4835-
}
4877+
/* Stop if case-varying character (it's sort of a wildcard) */
4878+
if (case_insensitive &&
4879+
pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
4880+
break;
48364881

4837-
/*
4838-
* NOTE: this code used to think that %% meant a literal %, but
4839-
* textlike() itself does not think that, and the SQL92 spec doesn't
4840-
* say any such thing either.
4841-
*/
48424882
match[match_pos++] = patt[pos];
48434883
}
48444884

@@ -4870,7 +4910,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
48704910
}
48714911

48724912
static Pattern_Prefix_Status
4873-
regex_fixed_prefix(Const *patt_const, bool case_insensitive,
4913+
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
48744914
Const **prefix_const, Const **rest_const)
48754915
{
48764916
char *match;
@@ -4883,6 +4923,8 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
48834923
char *rest;
48844924
Oid typeid = patt_const->consttype;
48854925
bool is_multibyte = (pg_database_encoding_max_length() > 1);
4926+
pg_locale_t locale = 0;
4927+
bool locale_is_c = false;
48864928

48874929
/*
48884930
* Should be unnecessary, there are no bytea regex operators defined. As
@@ -4894,6 +4936,28 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
48944936
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
48954937
errmsg("regular-expression matching not supported on type bytea")));
48964938

4939+
if (case_insensitive)
4940+
{
4941+
/* If case-insensitive, we need locale info */
4942+
if (lc_ctype_is_c(collation))
4943+
locale_is_c = true;
4944+
else if (collation != DEFAULT_COLLATION_OID)
4945+
{
4946+
if (!OidIsValid(collation))
4947+
{
4948+
/*
4949+
* This typically means that the parser could not resolve a
4950+
* conflict of implicit collations, so report it that way.
4951+
*/
4952+
ereport(ERROR,
4953+
(errcode(ERRCODE_INDETERMINATE_COLLATION),
4954+
errmsg("could not determine which collation to use for regular expression"),
4955+
errhint("Use the COLLATE clause to set the collation explicitly.")));
4956+
}
4957+
locale = pg_newlocale_from_collation(collation);
4958+
}
4959+
}
4960+
48974961
/* the right-hand const is type text for all of these */
48984962
patt = TextDatumGetCString(patt_const->constvalue);
48994963

@@ -4969,17 +5033,10 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
49695033
patt[pos] == '$')
49705034
break;
49715035

4972-
/*
4973-
* XXX In multibyte character sets, we can't trust isalpha, so assume
4974-
* any multibyte char is potentially case-varying.
4975-
*/
4976-
if (case_insensitive)
4977-
{
4978-
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
4979-
break;
4980-
if (isalpha((unsigned char) patt[pos]))
4981-
break;
4982-
}
5036+
/* Stop if case-varying character (it's sort of a wildcard) */
5037+
if (case_insensitive &&
5038+
pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
5039+
break;
49835040

49845041
/*
49855042
* Check for quantifiers. Except for +, this means the preceding
@@ -5004,7 +5061,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
50045061
* backslash followed by alphanumeric is an escape, not a quoted
50055062
* character. Must treat it as having multiple possible matches.
50065063
* Note: since only ASCII alphanumerics are escapes, we don't have to
5007-
* be paranoid about multibyte here.
5064+
* be paranoid about multibyte or collations here.
50085065
*/
50095066
if (patt[pos] == '\\')
50105067
{
@@ -5056,24 +5113,24 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
50565113
}
50575114

50585115
Pattern_Prefix_Status
5059-
pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
5116+
pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
50605117
Const **prefix, Const **rest)
50615118
{
50625119
Pattern_Prefix_Status result;
50635120

50645121
switch (ptype)
50655122
{
50665123
case Pattern_Type_Like:
5067-
result = like_fixed_prefix(patt, false, prefix, rest);
5124+
result = like_fixed_prefix(patt, false, collation, prefix, rest);
50685125
break;
50695126
case Pattern_Type_Like_IC:
5070-
result = like_fixed_prefix(patt, true, prefix, rest);
5127+
result = like_fixed_prefix(patt, true, collation, prefix, rest);
50715128
break;
50725129
case Pattern_Type_Regex:
5073-
result = regex_fixed_prefix(patt, false, prefix, rest);
5130+
result = regex_fixed_prefix(patt, false, collation, prefix, rest);
50745131
break;
50755132
case Pattern_Type_Regex_IC:
5076-
result = regex_fixed_prefix(patt, true, prefix, rest);
5133+
result = regex_fixed_prefix(patt, true, collation, prefix, rest);
50775134
break;
50785135
default:
50795136
elog(ERROR, "unrecognized ptype: %d", (int) ptype);

src/include/utils/selfuncs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
132132

133133
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
134134
Pattern_Type ptype,
135+
Oid collation,
135136
Const **prefix,
136137
Const **rest);
137138
extern Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy