Skip to content

Commit a5cf12e

Browse files
committed
Fix performance issues in replace_text(), replace_text_regexp(), and
text_to_array(): all three had O(N^2) behavior on long input strings in multibyte encodings, because they repeatedly rescanned the input text to locate substrings whose positions and lengths were computed in characters rather than bytes. Fix by tracking the current source position as a char pointer as well as a character count. Also avoid some unnecessary palloc operations. In addition, text_to_array() leaked memory within a single call because it failed to pfree its temporary strings. Per gripe from Tatsuo Ishii.
1 parent 9d6f263 commit a5cf12e

File tree

1 file changed

+131
-56
lines changed

1 file changed

+131
-56
lines changed

src/backend/utils/adt/varlena.c

Lines changed: 131 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.153 2006/11/08 19:22:25 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -21,6 +21,7 @@
2121
#include "catalog/pg_type.h"
2222
#include "libpq/md5.h"
2323
#include "libpq/pqformat.h"
24+
#include "miscadmin.h"
2425
#include "parser/scansup.h"
2526
#include "regex/regex.h"
2627
#include "utils/builtins.h"
@@ -477,6 +478,32 @@ textcat(PG_FUNCTION_ARGS)
477478
PG_RETURN_TEXT_P(result);
478479
}
479480

481+
/*
482+
* charlen_to_bytelen()
483+
* Compute the number of bytes occupied by n characters starting at *p
484+
*
485+
* It is caller's responsibility that there actually are n characters;
486+
* the string need not be null-terminated.
487+
*/
488+
static int
489+
charlen_to_bytelen(const char *p, int n)
490+
{
491+
if (pg_database_encoding_max_length() == 1)
492+
{
493+
/* Optimization for single-byte encodings */
494+
return n;
495+
}
496+
else
497+
{
498+
const char *s;
499+
500+
for (s = p; n > 0; n--)
501+
s += pg_mblen(s);
502+
503+
return s - p;
504+
}
505+
}
506+
480507
/*
481508
* text_substr()
482509
* Return a substring starting at the specified position.
@@ -534,6 +561,8 @@ text_substr_no_len(PG_FUNCTION_ARGS)
534561
* functions. Note that the argument is passed as a Datum, to indicate that
535562
* it may still be in compressed/toasted form. We can avoid detoasting all
536563
* of it in some cases.
564+
*
565+
* The result is always a freshly palloc'd datum.
537566
*/
538567
static text *
539568
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
@@ -649,11 +678,23 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
649678
*/
650679
slice_size = (S1 + L1) * eml;
651680
}
652-
slice = DatumGetTextPSlice(str, slice_start, slice_size);
681+
682+
/*
683+
* If we're working with an untoasted source, no need to do an
684+
* extra copying step.
685+
*/
686+
if (VARATT_IS_EXTENDED(str))
687+
slice = DatumGetTextPSlice(str, slice_start, slice_size);
688+
else
689+
slice = (text *) DatumGetPointer(str);
653690

654691
/* see if we got back an empty string */
655692
if ((VARSIZE(slice) - VARHDRSZ) == 0)
693+
{
694+
if (slice != (text *) DatumGetPointer(str))
695+
pfree(slice);
656696
return PG_STR_GET_TEXT("");
697+
}
657698

658699
/* Now we can get the actual length of the slice in MB characters */
659700
slice_strlen = pg_mbstrlen_with_len(VARDATA(slice), VARSIZE(slice) - VARHDRSZ);
@@ -663,7 +704,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
663704
* says to return a zero-length string.
664705
*/
665706
if (S1 > slice_strlen)
707+
{
708+
if (slice != (text *) DatumGetPointer(str))
709+
pfree(slice);
666710
return PG_STR_GET_TEXT("");
711+
}
667712

668713
/*
669714
* Adjust L1 and E1 now that we know the slice string length. Again
@@ -695,6 +740,9 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
695740
VARATT_SIZEP(ret) = VARHDRSZ + (p - s);
696741
memcpy(VARDATA(ret), s, (p - s));
697742

743+
if (slice != (text *) DatumGetPointer(str))
744+
pfree(slice);
745+
698746
return ret;
699747
}
700748
else
@@ -2076,10 +2124,11 @@ replace_text(PG_FUNCTION_ARGS)
20762124
int src_text_len = TEXTLEN(src_text);
20772125
int from_sub_text_len = TEXTLEN(from_sub_text);
20782126
TextPositionState state;
2079-
text *chunk_text;
20802127
text *ret_text;
20812128
int start_posn;
20822129
int curr_posn;
2130+
int chunk_len;
2131+
char *start_ptr;
20832132
StringInfoData str;
20842133

20852134
if (src_text_len == 0 || from_sub_text_len == 0)
@@ -2097,31 +2146,31 @@ replace_text(PG_FUNCTION_ARGS)
20972146
PG_RETURN_TEXT_P(src_text);
20982147
}
20992148

2149+
/* start_ptr points to the start_posn'th character of src_text */
2150+
start_ptr = (char *) VARDATA(src_text);
2151+
21002152
initStringInfo(&str);
21012153

21022154
do
21032155
{
2104-
chunk_text = text_substring(PointerGetDatum(src_text),
2105-
start_posn,
2106-
curr_posn - start_posn,
2107-
false);
2108-
appendStringInfoText(&str, chunk_text);
2109-
pfree(chunk_text);
2156+
/* copy the data skipped over by last text_position_next() */
2157+
chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
2158+
appendBinaryStringInfo(&str, start_ptr, chunk_len);
21102159

21112160
appendStringInfoText(&str, to_sub_text);
21122161

2113-
start_posn = curr_posn + from_sub_text_len;
2162+
start_posn = curr_posn;
2163+
start_ptr += chunk_len;
2164+
start_posn += from_sub_text_len;
2165+
start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
2166+
21142167
curr_posn = text_position_next(start_posn, &state);
21152168
}
21162169
while (curr_posn > 0);
21172170

2118-
/* copy trailing chunk */
2119-
chunk_text = text_substring(PointerGetDatum(src_text),
2120-
start_posn,
2121-
-1,
2122-
true);
2123-
appendStringInfoText(&str, chunk_text);
2124-
pfree(chunk_text);
2171+
/* copy trailing data */
2172+
chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
2173+
appendBinaryStringInfo(&str, start_ptr, chunk_len);
21252174

21262175
text_position_cleanup(&state);
21272176

@@ -2166,11 +2215,13 @@ check_replace_text_has_escape_char(const text *replace_text)
21662215
* appendStringInfoRegexpSubstr
21672216
*
21682217
* Append replace_text to str, substituting regexp back references for
2169-
* \n escapes.
2218+
* \n escapes. start_ptr is the start of the match in the source string,
2219+
* at logical character position data_pos.
21702220
*/
21712221
static void
21722222
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
2173-
regmatch_t *pmatch, text *src_text)
2223+
regmatch_t *pmatch,
2224+
char *start_ptr, int data_pos)
21742225
{
21752226
const char *p = VARDATA(replace_text);
21762227
const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
@@ -2247,16 +2298,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
22472298
if (so != -1 && eo != -1)
22482299
{
22492300
/*
2250-
* Copy the text that is back reference of regexp. Because so and
2251-
* eo are counted in characters not bytes, it's easiest to use
2252-
* text_substring to pull out the correct chunk of text.
2301+
* Copy the text that is back reference of regexp. Note so and
2302+
* eo are counted in characters not bytes.
22532303
*/
2254-
text *append_text;
2255-
2256-
append_text = text_substring(PointerGetDatum(src_text),
2257-
so + 1, (eo - so), false);
2258-
appendStringInfoText(str, append_text);
2259-
pfree(append_text);
2304+
char *chunk_start;
2305+
int chunk_len;
2306+
2307+
Assert(so >= data_pos);
2308+
chunk_start = start_ptr;
2309+
chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
2310+
chunk_len = charlen_to_bytelen(chunk_start, eo - so);
2311+
appendBinaryStringInfo(str, chunk_start, chunk_len);
22602312
}
22612313
}
22622314
}
@@ -2284,6 +2336,7 @@ replace_text_regexp(text *src_text, void *regexp,
22842336
size_t data_len;
22852337
int search_start;
22862338
int data_pos;
2339+
char *start_ptr;
22872340
bool have_escape;
22882341

22892342
initStringInfo(&buf);
@@ -2295,10 +2348,17 @@ replace_text_regexp(text *src_text, void *regexp,
22952348
/* Check whether replace_text has escape char. */
22962349
have_escape = check_replace_text_has_escape_char(replace_text);
22972350

2298-
for (search_start = data_pos = 0; search_start <= data_len;)
2351+
/* start_ptr points to the data_pos'th character of src_text */
2352+
start_ptr = (char *) VARDATA(src_text);
2353+
data_pos = 0;
2354+
2355+
search_start = 0;
2356+
while (search_start <= data_len)
22992357
{
23002358
int regexec_result;
23012359

2360+
CHECK_FOR_INTERRUPTS();
2361+
23022362
regexec_result = pg_regexec(re,
23032363
data,
23042364
data_len,
@@ -2322,32 +2382,38 @@ replace_text_regexp(text *src_text, void *regexp,
23222382
}
23232383

23242384
/*
2325-
* Copy the text to the left of the match position. Because we are
2326-
* working with character not byte indexes, it's easiest to use
2327-
* text_substring to pull out the needed data.
2385+
* Copy the text to the left of the match position. Note we are
2386+
* given character not byte indexes.
23282387
*/
23292388
if (pmatch[0].rm_so - data_pos > 0)
23302389
{
2331-
text *left_text;
2332-
2333-
left_text = text_substring(PointerGetDatum(src_text),
2334-
data_pos + 1,
2335-
pmatch[0].rm_so - data_pos,
2336-
false);
2337-
appendStringInfoText(&buf, left_text);
2338-
pfree(left_text);
2390+
int chunk_len;
2391+
2392+
chunk_len = charlen_to_bytelen(start_ptr,
2393+
pmatch[0].rm_so - data_pos);
2394+
appendBinaryStringInfo(&buf, start_ptr, chunk_len);
2395+
/*
2396+
* Advance start_ptr over that text, to avoid multiple rescans
2397+
* of it if the replace_text contains multiple back-references.
2398+
*/
2399+
start_ptr += chunk_len;
2400+
data_pos = pmatch[0].rm_so;
23392401
}
23402402

23412403
/*
23422404
* Copy the replace_text. Process back references when the
23432405
* replace_text has escape characters.
23442406
*/
23452407
if (have_escape)
2346-
appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, src_text);
2408+
appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
2409+
start_ptr, data_pos);
23472410
else
23482411
appendStringInfoText(&buf, replace_text);
23492412

2350-
search_start = data_pos = pmatch[0].rm_eo;
2413+
/* Advance start_ptr and data_pos over the matched text. */
2414+
start_ptr += charlen_to_bytelen(start_ptr,
2415+
pmatch[0].rm_eo - data_pos);
2416+
data_pos = pmatch[0].rm_eo;
23512417

23522418
/*
23532419
* When global option is off, replace the first instance only.
@@ -2358,6 +2424,7 @@ replace_text_regexp(text *src_text, void *regexp,
23582424
/*
23592425
* Search from next character when the matching text is zero width.
23602426
*/
2427+
search_start = data_pos;
23612428
if (pmatch[0].rm_so == pmatch[0].rm_eo)
23622429
search_start++;
23632430
}
@@ -2367,12 +2434,10 @@ replace_text_regexp(text *src_text, void *regexp,
23672434
*/
23682435
if (data_pos < data_len)
23692436
{
2370-
text *right_text;
2437+
int chunk_len;
23712438

2372-
right_text = text_substring(PointerGetDatum(src_text),
2373-
data_pos + 1, -1, true);
2374-
appendStringInfoText(&buf, right_text);
2375-
pfree(right_text);
2439+
chunk_len = ((char *) src_text + VARSIZE(src_text)) - start_ptr;
2440+
appendBinaryStringInfo(&buf, start_ptr, chunk_len);
23762441
}
23772442

23782443
ret_text = PG_STR_GET_TEXT(buf.data);
@@ -2488,6 +2553,8 @@ text_to_array(PG_FUNCTION_ARGS)
24882553
int fldnum;
24892554
int start_posn;
24902555
int end_posn;
2556+
int chunk_len;
2557+
char *start_ptr;
24912558
text *result_text;
24922559
ArrayBuildState *astate = NULL;
24932560

@@ -2506,37 +2573,45 @@ text_to_array(PG_FUNCTION_ARGS)
25062573
text_position_setup(inputstring, fldsep, &state);
25072574

25082575
start_posn = 1;
2576+
/* start_ptr points to the start_posn'th character of inputstring */
2577+
start_ptr = (char *) VARDATA(inputstring);
2578+
25092579
for (fldnum = 1;; fldnum++) /* field number is 1 based */
25102580
{
25112581
end_posn = text_position_next(start_posn, &state);
25122582

25132583
if (end_posn == 0)
25142584
{
25152585
/* fetch last field */
2516-
result_text = text_substring(PointerGetDatum(inputstring),
2517-
start_posn,
2518-
-1,
2519-
true);
2586+
chunk_len = ((char *) inputstring + VARSIZE(inputstring)) - start_ptr;
25202587
}
25212588
else
25222589
{
25232590
/* fetch non-last field */
2524-
result_text = text_substring(PointerGetDatum(inputstring),
2525-
start_posn,
2526-
end_posn - start_posn,
2527-
false);
2591+
chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
25282592
}
25292593

2594+
/* must build a temp text datum to pass to accumArrayResult */
2595+
result_text = (text *) palloc(VARHDRSZ + chunk_len);
2596+
VARATT_SIZEP(result_text) = VARHDRSZ + chunk_len;
2597+
memcpy(VARDATA(result_text), start_ptr, chunk_len);
2598+
25302599
/* stash away this field */
25312600
astate = accumArrayResult(astate,
25322601
PointerGetDatum(result_text),
25332602
false,
25342603
TEXTOID,
25352604
CurrentMemoryContext);
25362605

2606+
pfree(result_text);
2607+
25372608
if (end_posn == 0)
25382609
break;
2539-
start_posn = end_posn + fldsep_len;
2610+
2611+
start_posn = end_posn;
2612+
start_ptr += chunk_len;
2613+
start_posn += fldsep_len;
2614+
start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
25402615
}
25412616

25422617
text_position_cleanup(&state);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy