Skip to content

Commit d074b4e

Browse files
committed
Fix regexp_matches() handling of zero-length matches.
We'd find the same match twice if it was of zero length and not immediately adjacent to the previous match. replace_text_regexp() got similar cases right, so adjust this search logic to match that. Note that even though the regexp_split_to_xxx() functions share this code, they did not display equivalent misbehavior, because the second match would be considered degenerate and ignored.

Jeevan Chalke, with some cosmetic changes by me.
1 parent c876fb4 commit d074b4e

File tree

4 files changed, with 75 additions and 8 deletions.

src/backend/utils/adt/regexp.c

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -957,14 +957,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
957957
break;
958958

959959
/*
960-
* Advance search position. Normally we start just after the end of
961-
* the previous match, but always advance at least one character (the
962-
* special case can occur if the pattern matches zero characters just
963-
* after the prior match or at the end of the string).
960+
* Advance search position. Normally we start the next search at the
961+
* end of the previous match; but if the match was of zero length, we
962+
* have to advance by one character, or we'd just find the same match
963+
* again.
964964
*/
965-
if (start_search < pmatch[0].rm_eo)
966-
start_search = pmatch[0].rm_eo;
967-
else
965+
start_search = prev_match_end;
966+
if (pmatch[0].rm_so == pmatch[0].rm_eo)
968967
start_search++;
969968
if (start_search > wide_len)
970969
break;

src/backend/utils/adt/varlena.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3083,7 +3083,10 @@ replace_text_regexp(text *src_text, void *regexp,
30833083
break;
30843084

30853085
/*
3086-
* Search from next character when the matching text is zero width.
3086+
* Advance search position. Normally we start the next search at the
3087+
* end of the previous match; but if the match was of zero length, we
3088+
* have to advance by one character, or we'd just find the same match
3089+
* again.
30873090
*/
30883091
search_start = data_pos;
30893092
if (pmatch[0].rm_so == pmatch[0].rm_eo)

src/test/regress/expected/strings.out

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,64 @@ SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
440440
{barbeque}
441441
(1 row)
442442

443+
-- start/end-of-line matches are of zero length
444+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg');
445+
regexp_matches
446+
----------------
447+
{""}
448+
{""}
449+
{""}
450+
{""}
451+
(4 rows)
452+
453+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg');
454+
regexp_matches
455+
----------------
456+
{""}
457+
{""}
458+
{""}
459+
{""}
460+
(4 rows)
461+
462+
SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg');
463+
regexp_matches
464+
----------------
465+
{1}
466+
{2}
467+
{3}
468+
{4}
469+
{""}
470+
(5 rows)
471+
472+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg');
473+
regexp_matches
474+
----------------
475+
{""}
476+
{1}
477+
{""}
478+
{2}
479+
{""}
480+
{3}
481+
{""}
482+
{4}
483+
{""}
484+
{""}
485+
(10 rows)
486+
487+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg');
488+
regexp_matches
489+
----------------
490+
{""}
491+
{1}
492+
{""}
493+
{2}
494+
{""}
495+
{3}
496+
{""}
497+
{4}
498+
{""}
499+
(9 rows)
500+
443501
-- give me errors
444502
SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
445503
ERROR: invalid regexp option: "z"

src/test/regress/sql/strings.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,13 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
158158
-- no capture groups
159159
SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
160160

161+
-- start/end-of-line matches are of zero length
162+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg');
163+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg');
164+
SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg');
165+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg');
166+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg');
167+
161168
-- give me errors
162169
SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
163170
SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy