Content-Length: 559458 | pFad | http://github.com/postgres/postgres/commit/b3e99115e44c5040c949c99d081ff3812e6ec4a3

Fix conversion of SIMILAR TO regexes for character classes · postgres/postgres@b3e9911 · GitHub
Skip to content

Commit b3e9911

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' although it should not have. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not apply while in a character class area, with specific cases added for all the characters converted outside character classes, such as an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
1 parent 6b0d69d commit b3e9911

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -759,8 +759,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
759759
int plen,
760760
elen;
761761
bool afterescape = false;
762-
bool incharclass = false;
763762
int nquotes = 0;
763+
int charclass_depth = 0; /* Nesting level of character classes,
764+
* encompassed by square brackets */
765+
int charclass_start = 0; /* State of the character class start,
766+
* for carets */
764767

765768
p = VARDATA_ANY(pat_text);
766769
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -890,7 +893,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
890893
/* fast path */
891894
if (afterescape)
892895
{
893-
if (pchar == '"' && !incharclass) /* escape-double-quote? */
896+
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
894897
{
895898
/* emit appropriate part separator, per notes above */
896899
if (nquotes == 0)
@@ -939,18 +942,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
939942
/* SQL escape character; do not send to output */
940943
afterescape = true;
941944
}
942-
else if (incharclass)
945+
else if (charclass_depth > 0)
943946
{
944947
if (pchar == '\\')
945948
*r++ = '\\';
946949
*r++ = pchar;
947-
if (pchar == ']')
948-
incharclass = false;
950+
951+
/*
952+
* Ignore a closing bracket at the start of a character class.
953+
* Such a bracket is taken literally rather than closing the
954+
* class. "charclass_start" is 1 right at the beginning of a
955+
* class and 2 after an initial caret.
956+
*/
957+
if (pchar == ']' && charclass_start > 2)
958+
charclass_depth--;
959+
else if (pchar == '[')
960+
charclass_depth++;
961+
962+
/*
963+
* If there is a caret right after the opening bracket, it negates
964+
* the character class, but a following closing bracket should
965+
* still be treated as a normal character. That holds only for
966+
* the first caret, so only the values 1 and 2 mean that closing
967+
* brackets should be taken literally.
968+
*/
969+
if (pchar == '^')
970+
charclass_start++;
971+
else
972+
charclass_start = 3; /* definitely past the start */
949973
}
950974
else if (pchar == '[')
951975
{
976+
/* start of a character class */
952977
*r++ = pchar;
953-
incharclass = true;
978+
charclass_depth++;
979+
charclass_start = 1;
954980
}
955981
else if (pchar == '%')
956982
{

src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
571571
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
572572
ERROR: invalid escape string
573573
HINT: Escape string must be empty or one character.
574+
-- Characters that should be left alone in character classes when a
575+
-- SIMILAR TO regexp pattern is converted to POSIX style.
576+
-- Underscore "_"
577+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
578+
QUERY PLAN
579+
------------------------------------------------
580+
Seq Scan on text_tbl
581+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
582+
(2 rows)
583+
584+
-- Percentage "%"
585+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
586+
QUERY PLAN
587+
--------------------------------------------------
588+
Seq Scan on text_tbl
589+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
590+
(2 rows)
591+
592+
-- Dot "."
593+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
594+
QUERY PLAN
595+
--------------------------------------------------
596+
Seq Scan on text_tbl
597+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
598+
(2 rows)
599+
600+
-- Dollar "$"
601+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
602+
QUERY PLAN
603+
--------------------------------------------------
604+
Seq Scan on text_tbl
605+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
606+
(2 rows)
607+
608+
-- Opening parenthesis "("
609+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
610+
ERROR: invalid regular expression: parentheses () not balanced
611+
-- Caret "^"
612+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
613+
QUERY PLAN
614+
------------------------------------------------------------------------
615+
Seq Scan on text_tbl
616+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
617+
(2 rows)
618+
619+
-- Closing square bracket "]" at the beginning of character class
620+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
621+
QUERY PLAN
622+
------------------------------------------------
623+
Seq Scan on text_tbl
624+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
625+
(2 rows)
626+
627+
-- Closing square bracket effective after two carets at the beginning
628+
-- of character class.
629+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
630+
QUERY PLAN
631+
---------------------------------------
632+
Seq Scan on text_tbl
633+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
634+
(2 rows)
635+
574636
-- Test backslash escapes in regexp_replace's replacement string
575637
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
576638
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
187187
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
188188
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
189189

190+
-- Characters that should be left alone in character classes when a
191+
-- SIMILAR TO regexp pattern is converted to POSIX style.
192+
-- Underscore "_"
193+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
194+
-- Percentage "%"
195+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
196+
-- Dot "."
197+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
198+
-- Dollar "$"
199+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
200+
-- Opening parenthesis "("
201+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
202+
-- Caret "^"
203+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
204+
-- Closing square bracket "]" at the beginning of character class
205+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
206+
-- Closing square bracket effective after two carets at the beginning
207+
-- of character class.
208+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
209+
190210
-- Test backslash escapes in regexp_replace's replacement string
191211
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
192212
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');

0 commit comments

Comments
 (0)








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/postgres/postgres/commit/b3e99115e44c5040c949c99d081ff3812e6ec4a3

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy