Content-Length: 559187 | pFad | http://github.com/postgres/postgres/commit/1fe15d25e65c5ef4fe3be9efd2927f3e3891f7b1

93 Fix conversion of SIMILAR TO regexes for character classes · postgres/postgres@1fe15d2 · GitHub
Skip to content

Commit 1fe15d2

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not be applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
1 parent 1412781 commit 1fe15d2

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -673,8 +673,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
673673
int plen,
674674
elen;
675675
bool afterescape = false;
676-
bool incharclass = false;
677676
int nquotes = 0;
677+
int charclass_depth = 0; /* Nesting level of character classes,
678+
* encompassed by square brackets */
679+
int charclass_start = 0; /* State of the character class start,
680+
* for carets */
678681

679682
p = VARDATA_ANY(pat_text);
680683
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -804,7 +807,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
804807
/* fast path */
805808
if (afterescape)
806809
{
807-
if (pchar == '"' && !incharclass) /* escape-double-quote? */
810+
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
808811
{
809812
/* emit appropriate part separator, per notes above */
810813
if (nquotes == 0)
@@ -853,18 +856,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
853856
/* SQL escape character; do not send to output */
854857
afterescape = true;
855858
}
856-
else if (incharclass)
859+
else if (charclass_depth > 0)
857860
{
858861
if (pchar == '\\')
859862
*r++ = '\\';
860863
*r++ = pchar;
861-
if (pchar == ']')
862-
incharclass = false;
864+
865+
/*
866+
* Ignore a closing bracket at the start of a character class.
867+
* Such a bracket is taken literally rather than closing the
868+
* class. "charclass_start" is 1 right at the beginning of a
869+
* class and 2 after an initial caret.
870+
*/
871+
if (pchar == ']' && charclass_start > 2)
872+
charclass_depth--;
873+
else if (pchar == '[')
874+
charclass_depth++;
875+
876+
/*
877+
* If there is a caret right after the opening bracket, it negates
878+
* the character class, but a following closing bracket should
879+
* still be treated as a normal character. That holds only for
880+
* the first caret, so only the values 1 and 2 mean that closing
881+
* brackets should be taken literally.
882+
*/
883+
if (pchar == '^')
884+
charclass_start++;
885+
else
886+
charclass_start = 3; /* definitely past the start */
863887
}
864888
else if (pchar == '[')
865889
{
890+
/* start of a character class */
866891
*r++ = pchar;
867-
incharclass = true;
892+
charclass_depth++;
893+
charclass_start = 1;
868894
}
869895
else if (pchar == '%')
870896
{

src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
564564
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
565565
ERROR: invalid escape string
566566
HINT: Escape string must be empty or one character.
567+
-- Characters that should be left alone in character classes when a
568+
-- SIMILAR TO regexp pattern is converted to POSIX style.
569+
-- Underscore "_"
570+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
571+
QUERY PLAN
572+
------------------------------------------------
573+
Seq Scan on text_tbl
574+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
575+
(2 rows)
576+
577+
-- Percentage "%"
578+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
579+
QUERY PLAN
580+
--------------------------------------------------
581+
Seq Scan on text_tbl
582+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
583+
(2 rows)
584+
585+
-- Dot "."
586+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
587+
QUERY PLAN
588+
--------------------------------------------------
589+
Seq Scan on text_tbl
590+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
591+
(2 rows)
592+
593+
-- Dollar "$"
594+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
595+
QUERY PLAN
596+
--------------------------------------------------
597+
Seq Scan on text_tbl
598+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
599+
(2 rows)
600+
601+
-- Opening parenthesis "("
602+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
603+
ERROR: invalid regular expression: parentheses () not balanced
604+
-- Caret "^"
605+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
606+
QUERY PLAN
607+
------------------------------------------------------------------------
608+
Seq Scan on text_tbl
609+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
610+
(2 rows)
611+
612+
-- Closing square bracket "]" at the beginning of character class
613+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
614+
QUERY PLAN
615+
------------------------------------------------
616+
Seq Scan on text_tbl
617+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
618+
(2 rows)
619+
620+
-- Closing square bracket effective after two carets at the beginning
621+
-- of character class.
622+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
623+
QUERY PLAN
624+
---------------------------------------
625+
Seq Scan on text_tbl
626+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
627+
(2 rows)
628+
567629
-- Test back reference in regexp_replace
568630
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
569631
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
185185
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
186186
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
187187

188+
-- Characters that should be left alone in character classes when a
189+
-- SIMILAR TO regexp pattern is converted to POSIX style.
190+
-- Underscore "_"
191+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
192+
-- Percentage "%"
193+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
194+
-- Dot "."
195+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
196+
-- Dollar "$"
197+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
198+
-- Opening parenthesis "("
199+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
200+
-- Caret "^"
201+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
202+
-- Closing square bracket "]" at the beginning of character class
203+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
204+
-- Closing square bracket effective after two carets at the beginning
205+
-- of character class.
206+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
207+
188208
-- Test back reference in regexp_replace
189209
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
190210
SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g');

0 commit comments

Comments
 (0)








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/postgres/postgres/commit/1fe15d25e65c5ef4fe3be9efd2927f3e3891f7b1

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy