Add support for Unicode case folding.

jeff-davis · jeff-davis · commit 4e7f62bc386a · 2025-01-23T09:06:50.000-08:00
Expand case mapping tables to include entries for case folding, which are parsed from CaseFolding.txt. Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
@@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
+CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 unicode_version.h: generate-unicode_version.pl
 	$(PERL) $< --version $(UNICODE_VERSION)
 
-unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
+unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt
 	$(PERL) $<
 
 unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
@@ -91,4 +91,4 @@ clean:
 	rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
 
 distclean: clean
-	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
+	rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
@@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code)
 	pg_wchar	lower = unicode_lowercase_simple(code);
 	pg_wchar	title = unicode_titlecase_simple(code);
 	pg_wchar	upper = unicode_uppercase_simple(code);
+	pg_wchar	fold = unicode_casefold_simple(code);
 	pg_wchar	iculower = u_tolower(code);
 	pg_wchar	icutitle = u_totitle(code);
 	pg_wchar	icuupper = u_toupper(code);
+	pg_wchar	icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
 
-	if (lower != iculower || title != icutitle || upper != icuupper)
+	if (lower != iculower || title != icutitle || upper != icuupper ||
+		fold != icufold)
 	{
 		printf("case_test: FAILURE for codepoint 0x%06x\n", code);
-		printf("case_test: Postgres lower/title/upper:	0x%06x/0x%06x/0x%06x\n",
-			   lower, title, upper);
-		printf("case_test: ICU lower/title/upper:		0x%06x/0x%06x/0x%06x\n",
-			   iculower, icutitle, icuupper);
+		printf("case_test: Postgres lower/title/upper/fold:	0x%06x/0x%06x/0x%06x/0x%06x\n",
+			   lower, title, upper, fold);
+		printf("case_test: ICU lower/title/upper/fold:		0x%06x/0x%06x/0x%06x/0x%06x\n",
+			   iculower, icutitle, icuupper, icufold);
 		printf("\n");
 		exit(1);
 	}
@@ -103,9 +106,11 @@ icu_test_full(char *str)
 	char		lower[BUFSZ];
 	char		title[BUFSZ];
 	char		upper[BUFSZ];
+	char		fold[BUFSZ];
 	char		icu_lower[BUFSZ];
 	char		icu_title[BUFSZ];
 	char		icu_upper[BUFSZ];
+	char		icu_fold[BUFSZ];
 	UErrorCode	status;
 	struct WordBoundaryState wbstate = {
 		.str = str,
@@ -118,12 +123,15 @@ icu_test_full(char *str)
 	unicode_strlower(lower, BUFSZ, str, -1, true);
 	unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
 	unicode_strupper(upper, BUFSZ, str, -1, true);
+	unicode_strfold(fold, BUFSZ, str, -1, true);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
+	status = U_ZERO_ERROR;
+	ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
 
 	if (strcmp(lower, icu_lower) != 0)
 	{
@@ -143,6 +151,12 @@ icu_test_full(char *str)
 			   icu_upper);
 		exit(1);
 	}
+	if (strcmp(fold, icu_fold) != 0)
+	{
+		printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
+			   icu_fold);
+		exit(1);
+	}
 }
 
 /*
@@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src,
 	return unicode_strupper(dst, dstsize, src, srclen, true);
 }
 
+static size_t
+tfunc_fold(char *dst, size_t dstsize, const char *src,
+		   ssize_t srclen)
+{
+	return unicode_strfold(dst, dstsize, src, srclen, true);
+}
 
 static void
 test_convert_case()
@@ -318,10 +338,12 @@ test_convert_case()
 	test_convert(tfunc_upper, "ß", "SS");
 	test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
 	test_convert(tfunc_upper, "ıiIİ", "IIIİ");
+	test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
 	/* test final sigma */
 	test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
 	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
 	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
+	test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
 
 #ifdef USE_ICU
 	icu_test_full("");
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
@@ -49,7 +49,8 @@
 		$simple{$code} = {
 			Simple_Lowercase => ($simple_lowercase || $code),
 			Simple_Titlecase => ($simple_titlecase || $code),
-			Simple_Uppercase => ($simple_uppercase || $code)
+			Simple_Uppercase => ($simple_uppercase || $code),
+			Simple_Foldcase => $code,
 		};
 	}
 }
@@ -87,6 +88,7 @@
 	my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
 	my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
 	my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
+	my @fold = ();
 	my @conditions = map {
 		# supporting negated conditions may require storing a
 		# mask of relevant conditions for a given rule to differentiate
@@ -101,6 +103,7 @@
 	push @lower, $code if (scalar @lower == 0);
 	push @title, $code if (scalar @title == 0);
 	push @upper, $code if (scalar @upper == 0);
+	push @fold, $code;
 
 	# none should map to more than 3 codepoints
 	die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
@@ -114,13 +117,15 @@
 	while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
 	while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
 	while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
+	while (scalar @fold < $MAX_CASE_EXPANSION)  { push @fold, 0x000000 }
 
 	# Characters with special mappings may not have simple mappings;
 	# ensure that an entry exists.
 	$simple{$code} ||= {
 		Simple_Lowercase => $code,
 		Simple_Titlecase => $code,
-		Simple_Uppercase => $code
+		Simple_Uppercase => $code,
+		Simple_Foldcase => $code
 	};
 
 	# Multiple special case rules for a single codepoint could be
@@ -135,11 +140,96 @@
 		Lowercase => \@lower,
 		Titlecase => \@title,
 		Uppercase => \@upper,
+		Foldcase => \@fold,
 		Conditions => $cond_str
 	};
 }
 close $FH;
 
+open($FH, '<', "$output_path/CaseFolding.txt")
+  or die "Could not open $output_path/CaseFolding.txt: $!.";
+while (my $line = <$FH>)
+{
+	# remove comments
+	$line =~ s/^(.*?)#.*$/$1/s;
+
+	# ignore lines without any fields (blank or comment-only)
+	next unless $line =~ /;/;
+
+	my @elts = split(';', $line);
+	my $code = hex($elts[0]);
+	my $status = $elts[1] =~ s/^\s+|\s+$//rg;
+
+	# Codepoint may map to multiple characters when folding. Split the
+	# mapping on whitespace and keep only the tokens that consist
+	# entirely of hex digits, converted into an array of codepoints.
+	my @fold = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
+
+	die "codepoint $code out of range" if $code > 0x10FFFF;
+
+	# status 'T' unsupported; skip
+	next if $status eq 'T';
+
+	# encountered unrecognized status type
+	die "unsupported status type '$status'"
+	  if $status ne 'S' && $status ne 'C' && $status ne 'F';
+
+	# initialize simple case mappings if they don't exist
+	$simple{$code} ||= {
+		Simple_Lowercase => $code,
+		Simple_Titlecase => $code,
+		Simple_Uppercase => $code,
+		Simple_Foldcase => $code
+	};
+
+	if ($status eq 'S' || $status eq 'C')
+	{
+		die
+		  "Simple case folding for $code has multiple codepoints: '$line' '$elts[2]'"
+		  if scalar @fold != 1;
+		my $simple_foldcase = $fold[0];
+
+		die "Simple_Foldcase $code out of range"
+		  if $simple_foldcase > 0x10FFFF;
+
+		$simple{$code}{Simple_Foldcase} = $simple_foldcase;
+	}
+
+	if ($status eq 'F' || ($status eq 'C' && defined $special{$code}))
+	{
+		while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
+
+		# initialize special case mappings if they don't exist
+		if (!defined $special{$code})
+		{
+			my @lower = ($simple{$code}{Simple_Lowercase});
+			my @title = ($simple{$code}{Simple_Titlecase});
+			my @upper = ($simple{$code}{Simple_Uppercase});
+			while (scalar @lower < $MAX_CASE_EXPANSION)
+			{
+				push @lower, 0x000000;
+			}
+			while (scalar @title < $MAX_CASE_EXPANSION)
+			{
+				push @title, 0x000000;
+			}
+			while (scalar @upper < $MAX_CASE_EXPANSION)
+			{
+				push @upper, 0x000000;
+			}
+			$special{$code} = {
+				Lowercase => \@lower,
+				Titlecase => \@title,
+				Uppercase => \@upper,
+				Conditions => '0'
+			};
+		}
+
+		$special{$code}{Foldcase} = \@fold;
+	}
+}
+close $FH;
+
 # assign sequential array indexes to the special mappings
 my $special_idx = 0;
 foreach my $code (sort { $a <=> $b } (keys %special))
@@ -202,6 +292,7 @@
 	CaseLower = 0,
 	CaseTitle = 1,
 	CaseUpper = 2,
+	CaseFold = 3,
 	NCaseKind
 } CaseKind;
 
@@ -232,14 +323,17 @@
 	die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
 	die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
 	die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
+	die if scalar @{ $special{$code}{Foldcase} } != $MAX_CASE_EXPANSION;
 	my $lower = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
 	my $title = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
 	my $upper = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
+	my $fold = join ", ",
+	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
 	printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
-	printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
+	printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
 }
 
 print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
@@ -260,11 +354,13 @@
 	my $lc = ($simple{$code}{Simple_Lowercase} || $code);
 	my $tc = ($simple{$code}{Simple_Titlecase} || $code);
 	my $uc = ($simple{$code}{Simple_Uppercase} || $code);
+	my $fc = ($simple{$code}{Simple_Foldcase} || $code);
+
 	die "unexpected special case for code $code"
 	  if defined $special{$code};
 	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
-	  $code, $lc, $tc, $uc;
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
+	  $code, $lc, $tc, $uc, $fc;
 }
 printf $OT "\n";
 
@@ -280,8 +376,8 @@
 		$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
 	}
 	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
 	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
-	  $map->{Simple_Uppercase}, $special_case;
+	  $map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
 }
 print $OT "};\n";
diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build
@@ -11,7 +11,7 @@ endif
 
 # These files are part of the Unicode Character Database. Download them on
 # demand.
-foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
+foreach f : ['CompositionExclusions.txt', 'CaseFolding.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
   url = unicode_baseurl.format(UNICODE_VERSION, f)
   target = custom_target(f,
     output: f,
@@ -26,7 +26,7 @@ update_unicode_targets = []
 
 update_unicode_targets += \
   custom_target('unicode_case_table.h',
-    input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
+    input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
     output: ['unicode_case_table.h'],
     command: [
       perl, files('generate-unicode_case_table.pl'),
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
@@ -51,6 +51,14 @@ unicode_uppercase_simple(pg_wchar code)
 	return map ? map->simplemap[CaseUpper] : code;
 }
 
+pg_wchar
+unicode_casefold_simple(pg_wchar code)
+{
+	const pg_case_map *map = find_case_map(code);
+
+	return map ? map->simplemap[CaseFold] : code;
+}
+
 /*
  * unicode_strlower()
  *
@@ -142,6 +150,30 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 						NULL);
 }
 
+/*
+ * unicode_strfold()
+ *
+ * Case fold src, and return the result length (not including terminating
+ * NUL).
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ */
+size_t
+unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				bool full)
+{
+	return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
+						NULL);
+}
+
 /*
  * Implement Unicode Default Case Conversion algorithm.
  *
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
@@ -21,12 +21,15 @@ typedef size_t (*WordBoundaryNext) (void *wbstate);
 pg_wchar	unicode_lowercase_simple(pg_wchar code);
 pg_wchar	unicode_titlecase_simple(pg_wchar code);
 pg_wchar	unicode_uppercase_simple(pg_wchar code);
+pg_wchar	unicode_casefold_simple(pg_wchar code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen, bool full,
 							 WordBoundaryNext wbnext, void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen, bool full);
+size_t		unicode_strfold(char *dst, size_t dstsize, const char *src,
+							ssize_t srclen, bool full);
 
 #endif							/* UNICODE_CASE_H */
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h