Skip to content

Commit b2bdb7b

Browse files
committed
Fix regexp_matches() handling of zero-length matches.
We'd find the same match twice if it was of zero length and not immediately adjacent to the previous match. replace_text_regexp() got similar cases right, so adjust this search logic to match that. Note that even though the regexp_split_to_xxx() functions share this code, they did not display equivalent misbehavior, because the second match would be considered degenerate and ignored.

Jeevan Chalke, with some cosmetic changes by me.
1 parent 21c2d4c commit b2bdb7b

File tree

4 files changed

+75
-8
lines changed

4 files changed

+75
-8
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -937,14 +937,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
937937
break;
938938

939939
/*
940-
* Advance search position. Normally we start just after the end of
941-
* the previous match, but always advance at least one character (the
942-
* special case can occur if the pattern matches zero characters just
943-
* after the prior match or at the end of the string).
940+
* Advance search position. Normally we start the next search at the
941+
* end of the previous match; but if the match was of zero length, we
942+
* have to advance by one character, or we'd just find the same match
943+
* again.
944944
*/
945-
if (start_search < pmatch[0].rm_eo)
946-
start_search = pmatch[0].rm_eo;
947-
else
945+
start_search = prev_match_end;
946+
if (pmatch[0].rm_so == pmatch[0].rm_eo)
948947
start_search++;
949948
if (start_search > wide_len)
950949
break;

src/backend/utils/adt/varlena.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2624,7 +2624,10 @@ replace_text_regexp(text *src_text, void *regexp,
26242624
break;
26252625

26262626
/*
2627-
* Search from next character when the matching text is zero width.
2627+
* Advance search position. Normally we start the next search at the
2628+
* end of the previous match; but if the match was of zero length, we
2629+
* have to advance by one character, or we'd just find the same match
2630+
* again.
26282631
*/
26292632
search_start = data_pos;
26302633
if (pmatch[0].rm_so == pmatch[0].rm_eo)

src/test/regress/expected/strings.out

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,64 @@ SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
347347
{barbeque}
348348
(1 row)
349349

350+
-- start/end-of-line matches are of zero length
351+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg');
352+
regexp_matches
353+
----------------
354+
{""}
355+
{""}
356+
{""}
357+
{""}
358+
(4 rows)
359+
360+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg');
361+
regexp_matches
362+
----------------
363+
{""}
364+
{""}
365+
{""}
366+
{""}
367+
(4 rows)
368+
369+
SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg');
370+
regexp_matches
371+
----------------
372+
{1}
373+
{2}
374+
{3}
375+
{4}
376+
{""}
377+
(5 rows)
378+
379+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg');
380+
regexp_matches
381+
----------------
382+
{""}
383+
{1}
384+
{""}
385+
{2}
386+
{""}
387+
{3}
388+
{""}
389+
{4}
390+
{""}
391+
{""}
392+
(10 rows)
393+
394+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg');
395+
regexp_matches
396+
----------------
397+
{""}
398+
{1}
399+
{""}
400+
{2}
401+
{""}
402+
{3}
403+
{""}
404+
{4}
405+
{""}
406+
(9 rows)
407+
350408
-- give me errors
351409
SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
352410
ERROR: invalid regexp option: "z"

src/test/regress/sql/strings.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$);
137137
-- no capture groups
138138
SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$);
139139

140+
-- start/end-of-line matches are of zero length
141+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg');
142+
SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg');
143+
SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg');
144+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg');
145+
SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg');
146+
140147
-- give me errors
141148
SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz');
142149
SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy