Skip to content

Commit 4e7f62b

Browse files
committed
Add support for Unicode case folding.
Expand case mapping tables to include entries for case folding, which are parsed from CaseFolding.txt. Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
1 parent 7921927 commit 4e7f62b

File tree

7 files changed

+3280
-3125
lines changed

7 files changed

+3280
-3125
lines changed

src/common/unicode/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
3030
# These files are part of the Unicode Character Database. Download
3131
# them on demand. The dependency on Makefile.global is for
3232
# UNICODE_VERSION.
33-
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
33+
CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
3434
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3535

3636
unicode_version.h: generate-unicode_version.pl
3737
$(PERL) $< --version $(UNICODE_VERSION)
3838

39-
unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
39+
unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt
4040
$(PERL) $<
4141

4242
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
@@ -91,4 +91,4 @@ clean:
9191
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
9292

9393
distclean: clean
94-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

src/common/unicode/case_test.c

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code)
8181
pg_wchar lower = unicode_lowercase_simple(code);
8282
pg_wchar title = unicode_titlecase_simple(code);
8383
pg_wchar upper = unicode_uppercase_simple(code);
84+
pg_wchar fold = unicode_casefold_simple(code);
8485
pg_wchar iculower = u_tolower(code);
8586
pg_wchar icutitle = u_totitle(code);
8687
pg_wchar icuupper = u_toupper(code);
88+
pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
8789

88-
if (lower != iculower || title != icutitle || upper != icuupper)
90+
if (lower != iculower || title != icutitle || upper != icuupper ||
91+
fold != icufold)
8992
{
9093
printf("case_test: FAILURE for codepoint 0x%06x\n", code);
91-
printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
92-
lower, title, upper);
93-
printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
94-
iculower, icutitle, icuupper);
94+
printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
95+
lower, title, upper, fold);
96+
printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
97+
iculower, icutitle, icuupper, icufold);
9598
printf("\n");
9699
exit(1);
97100
}
@@ -103,9 +106,11 @@ icu_test_full(char *str)
103106
char lower[BUFSZ];
104107
char title[BUFSZ];
105108
char upper[BUFSZ];
109+
char fold[BUFSZ];
106110
char icu_lower[BUFSZ];
107111
char icu_title[BUFSZ];
108112
char icu_upper[BUFSZ];
113+
char icu_fold[BUFSZ];
109114
UErrorCode status;
110115
struct WordBoundaryState wbstate = {
111116
.str = str,
@@ -118,12 +123,15 @@ icu_test_full(char *str)
118123
unicode_strlower(lower, BUFSZ, str, -1, true);
119124
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
120125
unicode_strupper(upper, BUFSZ, str, -1, true);
126+
unicode_strfold(fold, BUFSZ, str, -1, true);
121127
status = U_ZERO_ERROR;
122128
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
123129
status = U_ZERO_ERROR;
124130
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
125131
status = U_ZERO_ERROR;
126132
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
133+
status = U_ZERO_ERROR;
134+
ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
127135

128136
if (strcmp(lower, icu_lower) != 0)
129137
{
@@ -143,6 +151,12 @@ icu_test_full(char *str)
143151
icu_upper);
144152
exit(1);
145153
}
154+
if (strcmp(fold, icu_fold) != 0)
155+
{
156+
printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
157+
icu_fold);
158+
exit(1);
159+
}
146160
}
147161

148162
/*
@@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src,
302316
return unicode_strupper(dst, dstsize, src, srclen, true);
303317
}
304318

319+
/*
 * Adapter passed to test_convert(): case folds src into dst through
 * unicode_strfold(), always requesting the "full" variant.
 */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
		   ssize_t srclen)
{
	const bool	full = true;

	return unicode_strfold(dst, dstsize, src, srclen, full);
}
305325

306326
static void
307327
test_convert_case()
@@ -318,10 +338,12 @@ test_convert_case()
318338
test_convert(tfunc_upper, "ß", "SS");
319339
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
320340
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
341+
test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
321342
/* test final sigma */
322343
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
323344
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
324345
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
346+
test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
325347

326348
#ifdef USE_ICU
327349
icu_test_full("");

src/common/unicode/generate-unicode_case_table.pl

Lines changed: 103 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@
4949
$simple{$code} = {
5050
Simple_Lowercase => ($simple_lowercase || $code),
5151
Simple_Titlecase => ($simple_titlecase || $code),
52-
Simple_Uppercase => ($simple_uppercase || $code)
52+
Simple_Uppercase => ($simple_uppercase || $code),
53+
Simple_Foldcase => $code,
5354
};
5455
}
5556
}
@@ -87,6 +88,7 @@
8788
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
8889
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
8990
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
91+
my @fold = ();
9092
my @conditions = map {
9193
# supporting negated conditions may require storing a
9294
# mask of relevant conditions for a given rule to differentiate
@@ -101,6 +103,7 @@
101103
push @lower, $code if (scalar @lower == 0);
102104
push @title, $code if (scalar @title == 0);
103105
push @upper, $code if (scalar @upper == 0);
106+
push @fold, $code;
104107

105108
# none should map to more than 3 codepoints
106109
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
@@ -114,13 +117,15 @@
114117
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
115118
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
116119
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
120+
while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
117121

118122
# Characters with special mappings may not have simple mappings;
119123
# ensure that an entry exists.
120124
$simple{$code} ||= {
121125
Simple_Lowercase => $code,
122126
Simple_Titlecase => $code,
123-
Simple_Uppercase => $code
127+
Simple_Uppercase => $code,
128+
Simple_Foldcase => $code
124129
};
125130

126131
# Multiple special case rules for a single codepoint could be
@@ -135,11 +140,96 @@
135140
Lowercase => \@lower,
136141
Titlecase => \@title,
137142
Uppercase => \@upper,
143+
Foldcase => \@fold,
138144
Conditions => $cond_str
139145
};
140146
}
141147
close $FH;
142148

149+
# Parse CaseFolding.txt and merge its mappings into %simple and %special.
#
# Each data line is "<code>; <status>; <mapping>;" optionally followed by
# a '#' comment. Status handling:
#   S, C - the single mapped codepoint becomes Simple_Foldcase
#   F    - the mapped codepoints become the special (full) Foldcase
#   C    - additionally becomes the special Foldcase, but only when the
#          codepoint already has a special mapping, so that every
#          %special entry carries a Foldcase array
#   T    - unsupported; skipped
# Any other status is a fatal error.
open($FH, '<', "$output_path/CaseFolding.txt")
  or die "Could not open $output_path/CaseFolding.txt: $!.";
while (my $line = <$FH>)
{
	# remove comments
	$line =~ s/^(.*?)#.*$/$1/s;

	# ignore empty lines
	next unless $line =~ /;/;

	my @elts = split(';', $line);
	my $code = hex($elts[0]);
	my $status = $elts[1] =~ s/^\s+|\s+$//rg;

	# Codepoint may map to multiple characters when folding. Split
	# each mapping on whitespace and extract the hexadecimal into an
	# array of codepoints. The pattern is anchored (as in the
	# SpecialCasing.txt parsing) so that only tokens consisting entirely
	# of hex digits are converted.
	my @fold =
	  map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));

	die "codepoint $code out of range" if $code > 0x10FFFF;

	# status 'T' unsupported; skip
	next if $status eq 'T';

	# encountered unrecognized status type
	die "unsupported status type '$status'"
	  if $status ne 'S' && $status ne 'C' && $status ne 'F';

	# initialize simple case mappings if they don't exist
	$simple{$code} ||= {
		Simple_Lowercase => $code,
		Simple_Titlecase => $code,
		Simple_Uppercase => $code,
		Simple_Foldcase => $code
	};

	if ($status eq 'S' || $status eq 'C')
	{
		die
		  "Simple case folding for $code has multiple codepoints: '$line' '$elts[2]'"
		  if scalar @fold != 1;
		my $simple_foldcase = $fold[0];

		die "Simple_Foldcase $code out of range"
		  if $simple_foldcase > 0x10FFFF;

		$simple{$code}{Simple_Foldcase} = $simple_foldcase;
	}

	if ($status eq 'F' || ($status eq 'C' && defined $special{$code}))
	{
		# Report a malformed or oversized mapping here, with context,
		# rather than leaving it to the bare size check performed when
		# the special_case table is written out.
		die "foldcase expansion for 0x$elts[0] is empty: '$elts[2]'"
		  if scalar @fold == 0;
		die "foldcase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
		  if scalar @fold > $MAX_CASE_EXPANSION;

		# pad with zeros up to the fixed expansion width
		while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }

		# initialize special case mappings if they don't exist, seeding
		# lower/title/upper from the simple mappings
		if (!defined $special{$code})
		{
			my @lower = ($simple{$code}{Simple_Lowercase});
			my @title = ($simple{$code}{Simple_Titlecase});
			my @upper = ($simple{$code}{Simple_Uppercase});
			foreach my $map (\@lower, \@title, \@upper)
			{
				while (scalar @$map < $MAX_CASE_EXPANSION)
				{
					push @$map, 0x000000;
				}
			}
			$special{$code} = {
				Lowercase => \@lower,
				Titlecase => \@title,
				Uppercase => \@upper,
				Conditions => '0'
			};
		}

		$special{$code}{Foldcase} = \@fold;
	}
}
close $FH;
232+
143233
# assign sequential array indexes to the special mappings
144234
my $special_idx = 0;
145235
foreach my $code (sort { $a <=> $b } (keys %special))
@@ -202,6 +292,7 @@
202292
CaseLower = 0,
203293
CaseTitle = 1,
204294
CaseUpper = 2,
295+
CaseFold = 3,
205296
NCaseKind
206297
} CaseKind;
207298
@@ -232,14 +323,17 @@
232323
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
233324
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
234325
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
326+
die if scalar @{ $special{$code}{Foldcase} } != $MAX_CASE_EXPANSION;
235327
my $lower = join ", ",
236328
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
237329
my $title = join ", ",
238330
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
239331
my $upper = join ", ",
240332
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
333+
my $fold = join ", ",
334+
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
241335
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
242-
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
336+
printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
243337
}
244338

245339
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
@@ -260,11 +354,13 @@
260354
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
261355
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
262356
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
357+
my $fc = ($simple{$code}{Simple_Foldcase} || $code);
358+
263359
die "unexpected special case for code $code"
264360
if defined $special{$code};
265361
printf $OT
266-
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
267-
$code, $lc, $tc, $uc;
362+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
363+
$code, $lc, $tc, $uc, $fc;
268364
}
269365
printf $OT "\n";
270366

@@ -280,8 +376,8 @@
280376
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
281377
}
282378
printf $OT
283-
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
379+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
284380
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
285-
$map->{Simple_Uppercase}, $special_case;
381+
$map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
286382
}
287383
print $OT "};\n";

src/common/unicode/meson.build

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ endif
1111

1212
# These files are part of the Unicode Character Database. Download them on
1313
# demand.
14-
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
14+
foreach f : ['CompositionExclusions.txt', 'CaseFolding.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
1515
url = unicode_baseurl.format(UNICODE_VERSION, f)
1616
target = custom_target(f,
1717
output: f,
@@ -26,7 +26,7 @@ update_unicode_targets = []
2626

2727
update_unicode_targets += \
2828
custom_target('unicode_case_table.h',
29-
input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
29+
input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
3030
output: ['unicode_case_table.h'],
3131
command: [
3232
perl, files('generate-unicode_case_table.pl'),

src/common/unicode_case.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ unicode_uppercase_simple(pg_wchar code)
5151
return map ? map->simplemap[CaseUpper] : code;
5252
}
5353

54+
pg_wchar
55+
unicode_casefold_simple(pg_wchar code)
56+
{
57+
const pg_case_map *map = find_case_map(code);
58+
59+
return map ? map->simplemap[CaseFold] : code;
60+
}
61+
5462
/*
5563
* unicode_strlower()
5664
*
@@ -142,6 +150,30 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
142150
NULL);
143151
}
144152

153+
/*
154+
* unicode_strfold()
155+
*
156+
* Case fold src, and return the result length (not including terminating
157+
* NUL).
158+
*
159+
* String src must be encoded in UTF-8. If srclen < 0, src must be
160+
* NUL-terminated.
161+
*
162+
* Result string is stored in dst, truncating if larger than dstsize. If
163+
* dstsize is greater than the result length, dst will be NUL-terminated;
164+
* otherwise not.
165+
*
166+
* If dstsize is zero, dst may be NULL. This is useful for calculating the
167+
* required buffer size before allocating.
168+
*/
169+
size_t
170+
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
171+
bool full)
172+
{
173+
return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
174+
NULL);
175+
}
176+
145177
/*
146178
* Implement Unicode Default Case Conversion algorithm.
147179
*

src/include/common/unicode_case.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@ typedef size_t (*WordBoundaryNext) (void *wbstate);
2121
pg_wchar unicode_lowercase_simple(pg_wchar code);
2222
pg_wchar unicode_titlecase_simple(pg_wchar code);
2323
pg_wchar unicode_uppercase_simple(pg_wchar code);
24+
pg_wchar unicode_casefold_simple(pg_wchar code);
2425
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
2526
ssize_t srclen, bool full);
2627
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
2728
ssize_t srclen, bool full,
2829
WordBoundaryNext wbnext, void *wbstate);
2930
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
3031
ssize_t srclen, bool full);
32+
size_t unicode_strfold(char *dst, size_t dstsize, const char *src,
33+
ssize_t srclen, bool full);
3134

3235
#endif /* UNICODE_CASE_H */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy