Skip to content

Commit bfc5992

Browse files
committed
Add SQL function CASEFOLD().
Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick
1 parent f15538c commit bfc5992

File tree

14 files changed

+278
-3
lines changed

14 files changed

+278
-3
lines changed

doc/src/sgml/func.sgml

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
25962596

25972597
<row>
25982598
<entry role="func_table_entry"><para role="func_signature">
2599-
<indexterm>
2599+
<indexterm id="function-lower">
26002600
<primary>lower</primary>
26012601
</indexterm>
26022602
<function>lower</function> ( <type>text</type> )
@@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
26572657

26582658
<row>
26592659
<entry role="func_table_entry"><para role="func_signature">
2660-
<indexterm>
2660+
<indexterm id="function-normalize">
26612661
<primary>normalize</primary>
26622662
</indexterm>
26632663
<indexterm>
@@ -3109,6 +3109,48 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
31093109
</para></entry>
31103110
</row>
31113111

3112+
<row>
3113+
<entry role="func_table_entry"><para role="func_signature">
3114+
<indexterm>
3115+
<primary>casefold</primary>
3116+
</indexterm>
3117+
<function>casefold</function> ( <type>text</type> )
3118+
<returnvalue>text</returnvalue>
3119+
</para>
3120+
<para>
3121+
Performs case folding of the input string according to the collation.
3122+
Case folding is similar to case conversion, but the purpose of case
3123+
folding is to facilitate case-insensitive comparison of strings,
3124+
whereas the purpose of case conversion is to convert to a particular
3125+
cased form. This function can only be used when the server encoding
3126+
is <literal>UTF8</literal>.
3127+
</para>
3128+
<para>
3129+
Ordinarily, case folding simply converts to lowercase, but there are a
3130+
few notable exceptions depending on the collation. For instance, the
3131+
character <literal>Σ</literal> (U+03A3) has two lowercase forms:
3132+
<literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case
3133+
folding in the <literal>PG_C_UTF8</literal> collation maps all three
3134+
forms to <literal>σ</literal>. Additionally, the result is not
3135+
necessarily lowercase; some characters may be folded to uppercase.
3136+
</para>
3137+
<para>
3138+
Case folding may change the length of the string. For instance, in
3139+
the <literal>PG_UNICODE_FAST</literal> collation, <literal>ß</literal>
3140+
(U+00DF) folds to <literal>ss</literal>.
3141+
</para>
3142+
<para>
3143+
<function>casefold</function> can be used for Unicode Default Caseless
3144+
Matching. It does not always preserve the normalized form of the
3145+
input string (see <xref linkend="function-normalize"/>).
3146+
</para>
3147+
<para>
3148+
The <literal>libc</literal> provider doesn't support case folding, so
3149+
<function>casefold</function> is identical to <xref
3150+
linkend="function-lower"/>.
3151+
</para></entry>
3152+
</row>
3153+
31123154
<row>
31133155
<entry role="func_table_entry"><para role="func_signature">
31143156
<indexterm>

src/backend/utils/adt/formatting.c

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
18191819
return result;
18201820
}
18211821

1822+
/*
1823+
* collation-aware, wide-character-aware case folding
1824+
*
1825+
* We pass the number of bytes so we can pass varlena and char*
1826+
* to this function. The result is a palloc'd, null-terminated string.
1827+
*/
1828+
char *
1829+
str_casefold(const char *buff, size_t nbytes, Oid collid)
1830+
{
1831+
char *result;
1832+
pg_locale_t mylocale;
1833+
1834+
if (!buff)
1835+
return NULL;
1836+
1837+
if (!OidIsValid(collid))
1838+
{
1839+
/*
1840+
* This typically means that the parser could not resolve a conflict
1841+
* of implicit collations, so report it that way.
1842+
*/
1843+
ereport(ERROR,
1844+
(errcode(ERRCODE_INDETERMINATE_COLLATION),
1845+
errmsg("could not determine which collation to use for %s function",
1846+
"lower()"),
1847+
errhint("Use the COLLATE clause to set the collation explicitly.")));
1848+
}
1849+
1850+
if (GetDatabaseEncoding() != PG_UTF8)
1851+
ereport(ERROR,
1852+
(errcode(ERRCODE_SYNTAX_ERROR),
1853+
errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
1854+
1855+
mylocale = pg_newlocale_from_collation(collid);
1856+
1857+
/* C/POSIX collations use this path regardless of database encoding */
1858+
if (mylocale->ctype_is_c)
1859+
{
1860+
result = asc_tolower(buff, nbytes);
1861+
}
1862+
else
1863+
{
1864+
const char *src = buff;
1865+
size_t srclen = nbytes;
1866+
size_t dstsize;
1867+
char *dst;
1868+
size_t needed;
1869+
1870+
/* first try buffer of equal size plus terminating NUL */
1871+
dstsize = srclen + 1;
1872+
dst = palloc(dstsize);
1873+
1874+
needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
1875+
if (needed + 1 > dstsize)
1876+
{
1877+
/* grow buffer if needed and retry */
1878+
dstsize = needed + 1;
1879+
dst = repalloc(dst, dstsize);
1880+
needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
1881+
Assert(needed + 1 <= dstsize);
1882+
}
1883+
1884+
Assert(dst[needed] == '\0');
1885+
result = dst;
1886+
}
1887+
1888+
return result;
1889+
}
1890+
18221891
/*
18231892
* ASCII-only lower function
18241893
*

src/backend/utils/adt/oracle_compat.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
126126
PG_RETURN_TEXT_P(result);
127127
}
128128

129+
Datum
130+
casefold(PG_FUNCTION_ARGS)
131+
{
132+
text *in_string = PG_GETARG_TEXT_PP(0);
133+
char *out_string;
134+
text *result;
135+
136+
out_string = str_casefold(VARDATA_ANY(in_string),
137+
VARSIZE_ANY_EXHDR(in_string),
138+
PG_GET_COLLATION());
139+
result = cstring_to_text(out_string);
140+
pfree(out_string);
141+
142+
PG_RETURN_TEXT_P(result);
143+
}
144+
129145

130146
/********************************************************************
131147
*

src/backend/utils/adt/pg_locale.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,17 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
106106
ssize_t srclen, pg_locale_t locale);
107107
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
108108
ssize_t srclen, pg_locale_t locale);
109+
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
110+
ssize_t srclen, pg_locale_t locale);
109111

110112
extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
111113
ssize_t srclen, pg_locale_t locale);
112114
extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
113115
ssize_t srclen, pg_locale_t locale);
114116
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
115117
ssize_t srclen, pg_locale_t locale);
118+
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
119+
ssize_t srclen, pg_locale_t locale);
116120

117121
extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
118122
ssize_t srclen, pg_locale_t locale);
@@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
14471451
return 0; /* keep compiler quiet */
14481452
}
14491453

1454+
size_t
1455+
pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
1456+
pg_locale_t locale)
1457+
{
1458+
if (locale->provider == COLLPROVIDER_BUILTIN)
1459+
return strfold_builtin(dst, dstsize, src, srclen, locale);
1460+
#ifdef USE_ICU
1461+
else if (locale->provider == COLLPROVIDER_ICU)
1462+
return strfold_icu(dst, dstsize, src, srclen, locale);
1463+
#endif
1464+
/* for libc, just use strlower */
1465+
else if (locale->provider == COLLPROVIDER_LIBC)
1466+
return strlower_libc(dst, dstsize, src, srclen, locale);
1467+
else
1468+
/* shouldn't happen */
1469+
PGLOCALE_SUPPORT_ERROR(locale->provider);
1470+
1471+
return 0; /* keep compiler quiet */
1472+
}
1473+
14501474
/*
14511475
* pg_strcoll
14521476
*

src/backend/utils/adt/pg_locale_builtin.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
3131
ssize_t srclen, pg_locale_t locale);
3232
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
3333
ssize_t srclen, pg_locale_t locale);
34+
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
35+
ssize_t srclen, pg_locale_t locale);
3436

3537

3638
struct WordBoundaryState
@@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
107109
locale->info.builtin.casemap_full);
108110
}
109111

112+
size_t
113+
strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
114+
pg_locale_t locale)
115+
{
116+
return unicode_strfold(dest, destsize, src, srclen,
117+
locale->info.builtin.casemap_full);
118+
}
119+
110120
pg_locale_t
111121
create_pg_locale_builtin(Oid collid, MemoryContext context)
112122
{

src/backend/utils/adt/pg_locale_icu.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
5454
ssize_t srclen, pg_locale_t locale);
5555
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
5656
ssize_t srclen, pg_locale_t locale);
57+
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
58+
ssize_t srclen, pg_locale_t locale);
5759

5860
#ifdef USE_ICU
5961

@@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
117119
const UChar *src, int32_t srcLength,
118120
const char *locale,
119121
UErrorCode *pErrorCode);
122+
static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
123+
const UChar *src, int32_t srcLength,
124+
const char *locale,
125+
UErrorCode *pErrorCode);
120126

121127
static const struct collate_methods collate_methods_icu = {
122128
.strncoll = strncoll_icu,
@@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
439445
return result_len;
440446
}
441447

448+
size_t
449+
strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
450+
pg_locale_t locale)
451+
{
452+
int32_t len_uchar;
453+
int32_t len_conv;
454+
UChar *buff_uchar;
455+
UChar *buff_conv;
456+
size_t result_len;
457+
458+
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
459+
len_conv = icu_convert_case(u_strFoldCase_default, locale,
460+
&buff_conv, buff_uchar, len_uchar);
461+
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
462+
pfree(buff_uchar);
463+
pfree(buff_conv);
464+
465+
return result_len;
466+
}
467+
442468
/*
443469
* strncoll_icu_utf8
444470
*
@@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
673699
NULL, locale, pErrorCode);
674700
}
675701

702+
static int32_t
703+
u_strFoldCase_default(UChar *dest, int32_t destCapacity,
704+
const UChar *src, int32_t srcLength,
705+
const char *locale,
706+
UErrorCode *pErrorCode)
707+
{
708+
uint32 options = U_FOLD_CASE_DEFAULT;
709+
char lang[3];
710+
UErrorCode status;
711+
712+
/*
713+
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
714+
* folding does not accept a locale. Instead it just supports a single
715+
* option relevant to Turkic languages 'az' and 'tr'; check for those
716+
* languages to enable the option.
717+
*/
718+
status = U_ZERO_ERROR;
719+
uloc_getLanguage(locale, lang, 3, &status);
720+
if (U_SUCCESS(status))
721+
{
722+
/*
723+
* The option name is confusing, but it causes u_strFoldCase to use
724+
* the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
725+
*/
726+
if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
727+
options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
728+
}
729+
730+
return u_strFoldCase(dest, destCapacity, src, srcLength,
731+
options, pErrorCode);
732+
}
733+
676734
/*
677735
* strncoll_icu
678736
*

src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,6 @@
5757
*/
5858

5959
/* yyyymmddN */
60-
#define CATALOG_VERSION_NO 202501231
60+
#define CATALOG_VERSION_NO 202501232
6161

6262
#endif

src/include/catalog/pg_proc.dat

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3623,6 +3623,9 @@
36233623
{ oid => '872', descr => 'capitalize each word',
36243624
proname => 'initcap', prorettype => 'text', proargtypes => 'text',
36253625
prosrc => 'initcap' },
3626+
{ oid => '9569', descr => 'fold case',
3627+
proname => 'casefold', prorettype => 'text', proargtypes => 'text',
3628+
prosrc => 'casefold' },
36263629
{ oid => '873', descr => 'left-pad string to length',
36273630
proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text',
36283631
prosrc => 'lpad' },

src/include/utils/formatting.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
extern char *str_tolower(const char *buff, size_t nbytes, Oid collid);
2222
extern char *str_toupper(const char *buff, size_t nbytes, Oid collid);
2323
extern char *str_initcap(const char *buff, size_t nbytes, Oid collid);
24+
extern char *str_casefold(const char *buff, size_t nbytes, Oid collid);
2425

2526
extern char *asc_tolower(const char *buff, size_t nbytes);
2627
extern char *asc_toupper(const char *buff, size_t nbytes);

src/include/utils/pg_locale.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize,
134134
extern size_t pg_strupper(char *dest, size_t destsize,
135135
const char *src, ssize_t srclen,
136136
pg_locale_t locale);
137+
extern size_t pg_strfold(char *dest, size_t destsize,
138+
const char *src, ssize_t srclen,
139+
pg_locale_t locale);
137140
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
138141
extern int pg_strncoll(const char *arg1, ssize_t len1,
139142
const char *arg2, ssize_t len2, pg_locale_t locale);

src/test/regress/expected/collate.icu.utf8.out

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,30 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
255255
1 | hij | hij
256256
(2 rows)
257257

258+
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
259+
lower
260+
-------------------------------
261+
abcd 123 #$% ıiii̇ ß ß dždždž σσς
262+
(1 row)
263+
264+
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
265+
casefold
266+
---------------------------------
267+
abcd 123 #$% ıiii̇ ss ss dždždž σσσ
268+
(1 row)
269+
270+
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
271+
lower
272+
-------------------------------
273+
abcd 123 #$% ıiıi ß ß dždždž σσς
274+
(1 row)
275+
276+
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
277+
casefold
278+
---------------------------------
279+
abcd 123 #$% ıiıi ss ss dždždž σσσ
280+
(1 row)
281+
258282
-- LIKE/ILIKE
259283
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
260284
a | b

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy