Skip to content

Commit b3e9911

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not be applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
1 parent 6b0d69d commit b3e9911

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -759,8 +759,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
759759
int plen,
760760
elen;
761761
bool afterescape = false;
762-
bool incharclass = false;
763762
int nquotes = 0;
763+
int charclass_depth = 0; /* Nesting level of character classes,
764+
* encompassed by square brackets */
765+
int charclass_start = 0; /* State of the character class start,
766+
* for carets */
764767

765768
p = VARDATA_ANY(pat_text);
766769
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -890,7 +893,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
890893
/* fast path */
891894
if (afterescape)
892895
{
893-
if (pchar == '"' && !incharclass) /* escape-double-quote? */
896+
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
894897
{
895898
/* emit appropriate part separator, per notes above */
896899
if (nquotes == 0)
@@ -939,18 +942,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
939942
/* SQL escape character; do not send to output */
940943
afterescape = true;
941944
}
942-
else if (incharclass)
945+
else if (charclass_depth > 0)
943946
{
944947
if (pchar == '\\')
945948
*r++ = '\\';
946949
*r++ = pchar;
947-
if (pchar == ']')
948-
incharclass = false;
950+
951+
/*
952+
* Ignore a closing bracket at the start of a character class.
953+
* Such a bracket is taken literally rather than closing the
954+
* class. "charclass_start" is 1 right at the beginning of a
955+
* class and 2 after an initial caret.
956+
*/
957+
if (pchar == ']' && charclass_start > 2)
958+
charclass_depth--;
959+
else if (pchar == '[')
960+
charclass_depth++;
961+
962+
/*
963+
* If there is a caret right after the opening bracket, it negates
964+
* the character class, but a following closing bracket should
965+
* still be treated as a normal character. That holds only for
966+
* the first caret, so only the values 1 and 2 mean that closing
967+
* brackets should be taken literally.
968+
*/
969+
if (pchar == '^')
970+
charclass_start++;
971+
else
972+
charclass_start = 3; /* definitely past the start */
949973
}
950974
else if (pchar == '[')
951975
{
976+
/* start of a character class */
952977
*r++ = pchar;
953-
incharclass = true;
978+
charclass_depth++;
979+
charclass_start = 1;
954980
}
955981
else if (pchar == '%')
956982
{

src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
571571
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
572572
ERROR: invalid escape string
573573
HINT: Escape string must be empty or one character.
574+
-- Characters that should be left alone in character classes when a
575+
-- SIMILAR TO regexp pattern is converted to POSIX style.
576+
-- Underscore "_"
577+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
578+
QUERY PLAN
579+
------------------------------------------------
580+
Seq Scan on text_tbl
581+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
582+
(2 rows)
583+
584+
-- Percentage "%"
585+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
586+
QUERY PLAN
587+
--------------------------------------------------
588+
Seq Scan on text_tbl
589+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
590+
(2 rows)
591+
592+
-- Dot "."
593+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
594+
QUERY PLAN
595+
--------------------------------------------------
596+
Seq Scan on text_tbl
597+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
598+
(2 rows)
599+
600+
-- Dollar "$"
601+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
602+
QUERY PLAN
603+
--------------------------------------------------
604+
Seq Scan on text_tbl
605+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
606+
(2 rows)
607+
608+
-- Opening parenthesis "("
609+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
610+
ERROR: invalid regular expression: parentheses () not balanced
611+
-- Caret "^"
612+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
613+
QUERY PLAN
614+
------------------------------------------------------------------------
615+
Seq Scan on text_tbl
616+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
617+
(2 rows)
618+
619+
-- Closing square bracket "]" at the beginning of character class
620+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
621+
QUERY PLAN
622+
------------------------------------------------
623+
Seq Scan on text_tbl
624+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
625+
(2 rows)
626+
627+
-- Closing square bracket effective after two carets at the beginning
628+
-- of character class.
629+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
630+
QUERY PLAN
631+
---------------------------------------
632+
Seq Scan on text_tbl
633+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
634+
(2 rows)
635+
574636
-- Test backslash escapes in regexp_replace's replacement string
575637
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
576638
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
187187
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
188188
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
189189

190+
-- Characters that should be left alone in character classes when a
191+
-- SIMILAR TO regexp pattern is converted to POSIX style.
192+
-- Underscore "_"
193+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
194+
-- Percentage "%"
195+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
196+
-- Dot "."
197+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
198+
-- Dollar "$"
199+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
200+
-- Opening parenthesis "("
201+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
202+
-- Caret "^"
203+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
204+
-- Closing square bracket "]" at the beginning of character class
205+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
206+
-- Closing square bracket effective after two carets at the beginning
207+
-- of character class.
208+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
209+
190210
-- Test backslash escapes in regexp_replace's replacement string
191211
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
192212
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy