Skip to content

Commit 329304c

Browse files
committed
Support text position search functions with nondeterministic collations
This allows using text position search functions with nondeterministic collations. These functions are

- position, strpos
- replace
- split_part
- string_to_array
- string_to_table

which all use common internal infrastructure.

There was previously no internal implementation of this, so it was met with a not-supported error. This adds the internal implementation and removes the error.

Unlike with deterministic collations, the search cannot use any byte-by-byte optimized techniques but has to go substring by substring. We also need to consider that the found match could have a different length than the needle and that there could be substrings of different length matching at a position. In most cases, we need to find the longest such substring (greedy semantics), but this can be configured by each caller.

Reviewed-by: Euler Taveira <euler@eulerto.com>
Discussion: https://www.postgresql.org/message-id/flat/582b2613-0900-48ca-8b0d-340c06f4d400@eisentraut.org
1 parent 41336bf commit 329304c

File tree

3 files changed

+246
-48
lines changed

3 files changed

+246
-48
lines changed

src/backend/utils/adt/varlena.c

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ typedef struct varlena VarString;
5454
*/
5555
typedef struct
5656
{
57+
pg_locale_t locale; /* collation used for substring matching */
5758
bool is_multibyte_char_in_char; /* need to check char boundaries? */
59+
bool greedy; /* find longest possible substring? */
5860

5961
char *str1; /* haystack string */
6062
char *str2; /* needle string */
@@ -65,7 +67,13 @@ typedef struct
6567
int skiptablemask; /* mask for ANDing with skiptable subscripts */
6668
int skiptable[256]; /* skip distance for given mismatched char */
6769

70+
/*
71+
* Note that with nondeterministic collations, the length of the last
72+
* match is not necessarily equal to the length of the "needle" passed in.
73+
*/
6874
char *last_match; /* pointer to last match in 'str1' */
75+
int last_match_len; /* length of last match */
76+
int last_match_len_tmp; /* same but for internal use */
6977

7078
/*
7179
* Sometimes we need to convert the byte position of a match to a
@@ -1178,15 +1186,21 @@ text_position(text *t1, text *t2, Oid collid)
11781186
TextPositionState state;
11791187
int result;
11801188

1189+
check_collation_set(collid);
1190+
11811191
/* Empty needle always matches at position 1 */
11821192
if (VARSIZE_ANY_EXHDR(t2) < 1)
11831193
return 1;
11841194

11851195
/* Otherwise, can't match if haystack is shorter than needle */
1186-
if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1196+
if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
1197+
pg_newlocale_from_collation(collid)->deterministic)
11871198
return 0;
11881199

11891200
text_position_setup(t1, t2, collid, &state);
1201+
/* don't need greedy mode here */
1202+
state.greedy = false;
1203+
11901204
if (!text_position_next(&state))
11911205
result = 0;
11921206
else
@@ -1217,18 +1231,17 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12171231
{
12181232
int len1 = VARSIZE_ANY_EXHDR(t1);
12191233
int len2 = VARSIZE_ANY_EXHDR(t2);
1220-
pg_locale_t mylocale;
12211234

12221235
check_collation_set(collid);
12231236

1224-
mylocale = pg_newlocale_from_collation(collid);
1237+
state->locale = pg_newlocale_from_collation(collid);
12251238

1226-
if (!mylocale->deterministic)
1227-
ereport(ERROR,
1228-
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1229-
errmsg("nondeterministic collations are not supported for substring searches")));
1239+
/*
1240+
* Most callers need greedy mode, but some might want to unset this to
1241+
* optimize.
1242+
*/
1243+
state->greedy = true;
12301244

1231-
Assert(len1 > 0);
12321245
Assert(len2 > 0);
12331246

12341247
/*
@@ -1264,8 +1277,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12641277
* point in wasting cycles initializing the table. We also choose not to
12651278
* use B-M-H for needles of length 1, since the skip table can't possibly
12661279
* save anything in that case.
1280+
*
1281+
* (With nondeterministic collations, the search is already
1282+
* multibyte-aware, so we don't need this.)
12671283
*/
1268-
if (len1 >= len2 && len2 > 1)
1284+
if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
12691285
{
12701286
int searchlength = len1 - len2;
12711287
int skiptablemask;
@@ -1343,7 +1359,7 @@ text_position_next(TextPositionState *state)
13431359

13441360
/* Start from the point right after the previous match. */
13451361
if (state->last_match)
1346-
start_ptr = state->last_match + needle_len;
1362+
start_ptr = state->last_match + state->last_match_len;
13471363
else
13481364
start_ptr = state->str1;
13491365

@@ -1359,7 +1375,7 @@ text_position_next(TextPositionState *state)
13591375
* multi-byte character, we need to verify that the match was at a
13601376
* character boundary, not in the middle of a multi-byte character.
13611377
*/
1362-
if (state->is_multibyte_char_in_char)
1378+
if (state->is_multibyte_char_in_char && state->locale->deterministic)
13631379
{
13641380
/* Walk one character at a time, until we reach the match. */
13651381

@@ -1387,6 +1403,7 @@ text_position_next(TextPositionState *state)
13871403
}
13881404

13891405
state->last_match = matchptr;
1406+
state->last_match_len = state->last_match_len_tmp;
13901407
return true;
13911408
}
13921409

@@ -1408,7 +1425,62 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14081425

14091426
Assert(start_ptr >= haystack && start_ptr <= haystack_end);
14101427

1411-
if (needle_len == 1)
1428+
state->last_match_len_tmp = needle_len;
1429+
1430+
if (!state->locale->deterministic)
1431+
{
1432+
/*
1433+
* With a nondeterministic collation, we have to use an unoptimized
1434+
* route. We walk through the haystack and see if at each position
1435+
* there is a substring of the remaining string that is equal to the
1436+
* needle under the given collation.
1437+
*
1438+
* Note, the found substring could have a different length than the
1439+
* needle, including being empty. Callers that want to skip over the
1440+
* found string need to read the length of the found substring from
1441+
* last_match_len rather than just using the length of their needle.
1442+
*
1443+
* Most callers will require "greedy" semantics, meaning that we need
1444+
* to find the longest such substring, not the shortest. For callers
1445+
* that don't need greedy semantics, we can finish on the first match.
1446+
*/
1447+
const char *result_hptr = NULL;
1448+
1449+
hptr = start_ptr;
1450+
while (hptr < haystack_end)
1451+
{
1452+
/*
1453+
* First check the common case that there is a match in the
1454+
* haystack of exactly the length of the needle.
1455+
*/
1456+
if (!state->greedy &&
1457+
haystack_end - hptr >= needle_len &&
1458+
pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
1459+
return (char *) hptr;
1460+
1461+
/*
1462+
* Else check if any of the possible substrings starting at hptr
1463+
* are equal to the needle.
1464+
*/
1465+
for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
1466+
{
1467+
if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
1468+
{
1469+
state->last_match_len_tmp = (test_end - hptr);
1470+
result_hptr = hptr;
1471+
if (!state->greedy)
1472+
break;
1473+
}
1474+
}
1475+
if (result_hptr)
1476+
break;
1477+
1478+
hptr += pg_mblen(hptr);
1479+
}
1480+
1481+
return (char *) result_hptr;
1482+
}
1483+
else if (needle_len == 1)
14121484
{
14131485
/* No point in using B-M-H for a one-character needle */
14141486
char nchar = *needle;
@@ -4055,7 +4127,7 @@ replace_text(PG_FUNCTION_ARGS)
40554127

40564128
appendStringInfoText(&str, to_sub_text);
40574129

4058-
start_ptr = curr_ptr + from_sub_text_len;
4130+
start_ptr = curr_ptr + state.last_match_len;
40594131

40604132
found = text_position_next(&state);
40614133
if (found)
@@ -4445,7 +4517,7 @@ split_part(PG_FUNCTION_ARGS)
44454517
/* special case of last field does not require an extra pass */
44464518
if (fldnum == -1)
44474519
{
4448-
start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4520+
start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
44494521
end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
44504522
text_position_cleanup(&state);
44514523
PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
@@ -4475,7 +4547,7 @@ split_part(PG_FUNCTION_ARGS)
44754547
while (found && --fldnum > 0)
44764548
{
44774549
/* identify bounds of next field */
4478-
start_ptr = end_ptr + fldsep_len;
4550+
start_ptr = end_ptr + state.last_match_len;
44794551
found = text_position_next(&state);
44804552
if (found)
44814553
end_ptr = text_position_get_match_ptr(&state);
@@ -4691,7 +4763,7 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
46914763
if (!found)
46924764
break;
46934765

4694-
start_ptr = end_ptr + fldsep_len;
4766+
start_ptr = end_ptr + state.last_match_len;
46954767
}
46964768

46974769
text_position_cleanup(&state);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy