Skip to content

Commit 286a365

Browse files
committed
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
1 parent 6a9b2a6 commit 286a365

File tree

9 files changed: +3645 / -2993 lines changed

9 files changed: +3645 / -2993 lines changed

src/backend/utils/adt/pg_locale_builtin.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ size_t
7878
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
7979
pg_locale_t locale)
8080
{
81-
return unicode_strlower(dest, destsize, src, srclen);
81+
return unicode_strlower(dest, destsize, src, srclen, false);
8282
}
8383

8484
size_t
@@ -93,15 +93,15 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
9393
.prev_alnum = false,
9494
};
9595

96-
return unicode_strtitle(dest, destsize, src, srclen,
96+
return unicode_strtitle(dest, destsize, src, srclen, false,
9797
initcap_wbnext, &wbstate);
9898
}
9999

100100
size_t
101101
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
102102
pg_locale_t locale)
103103
{
104-
return unicode_strupper(dest, destsize, src, srclen);
104+
return unicode_strupper(dest, destsize, src, srclen, false);
105105
}
106106

107107
pg_locale_t

src/common/unicode/Makefile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
3030
# These files are part of the Unicode Character Database. Download
3131
# them on demand. The dependency on Makefile.global is for
3232
# UNICODE_VERSION.
33-
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
33+
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
3434
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3535

3636
unicode_version.h: generate-unicode_version.pl
@@ -91,4 +91,4 @@ clean:
9191
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
9292

9393
distclean: clean
94-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

src/common/unicode/case_test.c

Lines changed: 191 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -18,12 +18,61 @@
1818
#include <wctype.h>
1919

2020
#ifdef USE_ICU
21+
#include <unicode/ucasemap.h>
2122
#include <unicode/uchar.h>
2223
#endif
2324
#include "common/unicode_case.h"
2425
#include "common/unicode_category.h"
2526
#include "common/unicode_version.h"
2627

28+
/* enough to hold largest source or result string, including NUL */
29+
#define BUFSZ 256
30+
31+
#ifdef USE_ICU
32+
static UCaseMap * casemap = NULL;
33+
#endif
34+
35+
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
36+
ssize_t srclen);
37+
38+
/* simple boundary iterator copied from pg_locale_builtin.c */
39+
struct WordBoundaryState
40+
{
41+
const char *str;
42+
size_t len;
43+
size_t offset;
44+
bool init;
45+
bool prev_alnum;
46+
};
47+
48+
static size_t
49+
initcap_wbnext(void *state)
50+
{
51+
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
52+
53+
while (wbstate->offset < wbstate->len &&
54+
wbstate->str[wbstate->offset] != '\0')
55+
{
56+
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
57+
wbstate->offset);
58+
bool curr_alnum = pg_u_isalnum(u, true);
59+
60+
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
61+
{
62+
size_t prev_offset = wbstate->offset;
63+
64+
wbstate->init = true;
65+
wbstate->offset += unicode_utf8len(u);
66+
wbstate->prev_alnum = curr_alnum;
67+
return prev_offset;
68+
}
69+
70+
wbstate->offset += unicode_utf8len(u);
71+
}
72+
73+
return wbstate->len;
74+
}
75+
2776
#ifdef USE_ICU
2877

2978
static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
4897
}
4998
}
5099

100+
static void
101+
icu_test_full(char *str)
102+
{
103+
char lower[BUFSZ];
104+
char title[BUFSZ];
105+
char upper[BUFSZ];
106+
char icu_lower[BUFSZ];
107+
char icu_title[BUFSZ];
108+
char icu_upper[BUFSZ];
109+
UErrorCode status;
110+
struct WordBoundaryState wbstate = {
111+
.str = str,
112+
.len = strlen(str),
113+
.offset = 0,
114+
.init = false,
115+
.prev_alnum = false,
116+
};
117+
118+
unicode_strlower(lower, BUFSZ, str, -1, true);
119+
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
120+
unicode_strupper(upper, BUFSZ, str, -1, true);
121+
status = U_ZERO_ERROR;
122+
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
123+
status = U_ZERO_ERROR;
124+
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
125+
status = U_ZERO_ERROR;
126+
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
127+
128+
if (strcmp(lower, icu_lower) != 0)
129+
{
130+
printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
131+
icu_lower);
132+
exit(1);
133+
}
134+
if (strcmp(title, icu_title) != 0)
135+
{
136+
printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
137+
icu_title);
138+
exit(1);
139+
}
140+
if (strcmp(upper, icu_upper) != 0)
141+
{
142+
printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
143+
icu_upper);
144+
exit(1);
145+
}
146+
}
147+
51148
/*
52149
* Exhaustively compare case mappings with the results from ICU.
53150
*/
@@ -64,6 +161,7 @@ test_icu(void)
64161
if (category != PG_U_UNASSIGNED)
65162
{
66163
uint8_t icu_category = u_charType(code);
164+
char code_str[5] = {0};
67165

68166
if (icu_category == PG_U_UNASSIGNED)
69167
{
@@ -72,6 +170,9 @@ test_icu(void)
72170
}
73171

74172
icu_test_simple(code);
173+
unicode_to_utf8(code, (unsigned char *) code_str);
174+
icu_test_full(code_str);
175+
75176
successful++;
76177
}
77178
}
@@ -86,7 +187,7 @@ test_icu(void)
86187
#endif
87188

88189
static void
89-
test_strlower(const char *test_string, const char *expected)
190+
test_convert(TestFunc tfunc, const char *test_string, const char *expected)
90191
{
91192
size_t src1len = strlen(test_string);
92193
size_t src2len = -1; /* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
102203

103204
/* neither source nor destination are NUL-terminated */
104205
memset(dst1, 0x7F, dst1len);
105-
needed = unicode_strlower(dst1, dst1len, src1, src1len);
206+
needed = tfunc(dst1, dst1len, src1, src1len);
106207
if (needed != strlen(expected))
107208
{
108-
printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
209+
printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
210+
test_string, needed, strlen(expected));
109211
exit(1);
110212
}
111213
if (memcmp(dst1, expected, dst1len) != 0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
117219

118220
/* destination is NUL-terminated and source is not */
119221
memset(dst2, 0x7F, dst2len);
120-
needed = unicode_strlower(dst2, dst2len, src1, src1len);
222+
needed = tfunc(dst2, dst2len, src1, src1len);
121223
if (needed != strlen(expected))
122224
{
123-
printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
225+
printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
226+
test_string, needed, strlen(expected));
124227
exit(1);
125228
}
126229
if (strcmp(dst2, expected) != 0)
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
132235

133236
/* source is NUL-terminated and destination is not */
134237
memset(dst1, 0x7F, dst1len);
135-
needed = unicode_strlower(dst1, dst1len, src2, src2len);
238+
needed = tfunc(dst1, dst1len, src2, src2len);
136239
if (needed != strlen(expected))
137240
{
241+
printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
242+
test_string, needed, strlen(expected));
138243
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
139244
exit(1);
140245
}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
147252

148253
/* both source and destination are NUL-terminated */
149254
memset(dst2, 0x7F, dst2len);
150-
needed = unicode_strlower(dst2, dst2len, src2, src2len);
255+
needed = tfunc(dst2, dst2len, src2, src2len);
151256
if (needed != strlen(expected))
152257
{
153-
printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
258+
printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
259+
test_string, needed, strlen(expected));
154260
exit(1);
155261
}
156262
if (strcmp(dst2, expected) != 0)
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
166272
free(dst2);
167273
}
168274

275+
static size_t
276+
tfunc_lower(char *dst, size_t dstsize, const char *src,
277+
ssize_t srclen)
278+
{
279+
return unicode_strlower(dst, dstsize, src, srclen, true);
280+
}
281+
282+
static size_t
283+
tfunc_title(char *dst, size_t dstsize, const char *src,
284+
ssize_t srclen)
285+
{
286+
struct WordBoundaryState wbstate = {
287+
.str = src,
288+
.len = srclen,
289+
.offset = 0,
290+
.init = false,
291+
.prev_alnum = false,
292+
};
293+
294+
return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
295+
&wbstate);
296+
}
297+
298+
static size_t
299+
tfunc_upper(char *dst, size_t dstsize, const char *src,
300+
ssize_t srclen)
301+
{
302+
return unicode_strupper(dst, dstsize, src, srclen, true);
303+
}
304+
305+
169306
static void
170307
test_convert_case()
171308
{
172309
/* test string with no case changes */
173-
test_strlower("√∞", "√∞");
310+
test_convert(tfunc_lower, "√∞", "√∞");
311+
/* test adjust-to-cased behavior */
312+
test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
174313
/* test string with case changes */
175-
test_strlower("ABC", "abc");
314+
test_convert(tfunc_upper, "abc", "ABC");
176315
/* test string with case changes and byte length changes */
177-
test_strlower("ȺȺȺ", "ⱥⱥⱥ");
316+
test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
317+
/* test special case conversions */
318+
test_convert(tfunc_upper, "ß", "SS");
319+
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
320+
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
321+
/* test final sigma */
322+
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
323+
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
324+
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
325+
326+
#ifdef USE_ICU
327+
icu_test_full("");
328+
icu_test_full("ȺȺȺ");
329+
icu_test_full("ßßß");
330+
icu_test_full("√∞");
331+
icu_test_full("a b");
332+
icu_test_full("abc 123xyz");
333+
icu_test_full("σςΣ ΣΣΣ");
334+
icu_test_full("ıiIİ");
335+
/* test <alpha><iota_subscript><acute> */
336+
icu_test_full("\u0391\u0345\u0301");
337+
#endif
178338

179339
printf("case_test: convert_case: success\n");
180340
}
181341

182342
int
183343
main(int argc, char **argv)
184344
{
345+
#ifdef USE_ICU
346+
UErrorCode status = U_ZERO_ERROR;
347+
348+
/*
349+
* Disable ICU's word break adjustment for titlecase to match the expected
350+
* behavior of unicode_strtitle().
351+
*/
352+
casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
353+
if (U_FAILURE(status))
354+
{
355+
printf("case_test: failure opening UCaseMap: %s\n",
356+
u_errorName(status));
357+
exit(1);
358+
}
359+
#endif
360+
185361
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
186362
#ifdef USE_ICU
187363
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
191367
#endif
192368

193369
test_convert_case();
370+
371+
#ifdef USE_ICU
372+
ucasemap_close(casemap);
373+
#endif
194374
exit(0);
195375
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy