Skip to content

Commit 2ab0796

Browse files
committed
Fix char2wchar/wchar2char to support collations properly.
These functions should take a pg_locale_t, not a collation OID, and should call mbstowcs_l/wcstombs_l where available. Where those functions are not available, temporarily select the correct locale with uselocale().

This change removes the bogus assumption that all locales selectable in a given database have the same wide-character conversion method; in particular, the collate.linux.utf8 regression test now passes with LC_CTYPE=C, so long as the database encoding is UTF8.

I decided to move the char2wchar/wchar2char functions out of mbutils.c and into pg_locale.c, because they work on wchar_t, not pg_wchar, and thus don't really belong with the mbutils.c functions. Keeping them where they were would have required importing pg_locale_t into pg_wchar.h somehow, which did not seem like a good plan.
1 parent bb85030 commit 2ab0796

File tree

12 files changed

+217
-144
lines changed

12 files changed

+217
-144
lines changed

configure

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18985,7 +18985,8 @@ fi
1898518985

1898618986

1898718987

18988-
for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs
18988+
18989+
for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l
1898918990
do
1899018991
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
1899118992
{ $as_echo "$as_me:$LINENO: checking for $ac_func" >&5

configure.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1187,7 +1187,7 @@ PGAC_VAR_INT_TIMEZONE
11871187
AC_FUNC_ACCEPT_ARGTYPES
11881188
PGAC_FUNC_GETTIMEOFDAY_1ARG
11891189

1190-
AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])
1190+
AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l])
11911191

11921192
AC_REPLACE_FUNCS(fseeko)
11931193
case $host_os in

src/backend/tsearch/ts_locale.c

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@ t_isdigit(const char *ptr)
2929
int clen = pg_mblen(ptr);
3030
wchar_t character[2];
3131
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
32+
pg_locale_t mylocale = 0; /* TODO */
3233

3334
if (clen == 1 || lc_ctype_is_c(collation))
3435
return isdigit(TOUCHAR(ptr));
3536

36-
char2wchar(character, 2, ptr, clen, collation);
37+
char2wchar(character, 2, ptr, clen, mylocale);
3738

3839
return iswdigit((wint_t) character[0]);
3940
}
@@ -44,11 +45,12 @@ t_isspace(const char *ptr)
4445
int clen = pg_mblen(ptr);
4546
wchar_t character[2];
4647
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
48+
pg_locale_t mylocale = 0; /* TODO */
4749

4850
if (clen == 1 || lc_ctype_is_c(collation))
4951
return isspace(TOUCHAR(ptr));
5052

51-
char2wchar(character, 2, ptr, clen, collation);
53+
char2wchar(character, 2, ptr, clen, mylocale);
5254

5355
return iswspace((wint_t) character[0]);
5456
}
@@ -59,11 +61,12 @@ t_isalpha(const char *ptr)
5961
int clen = pg_mblen(ptr);
6062
wchar_t character[2];
6163
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
64+
pg_locale_t mylocale = 0; /* TODO */
6265

6366
if (clen == 1 || lc_ctype_is_c(collation))
6467
return isalpha(TOUCHAR(ptr));
6568

66-
char2wchar(character, 2, ptr, clen, collation);
69+
char2wchar(character, 2, ptr, clen, mylocale);
6770

6871
return iswalpha((wint_t) character[0]);
6972
}
@@ -74,11 +77,12 @@ t_isprint(const char *ptr)
7477
int clen = pg_mblen(ptr);
7578
wchar_t character[2];
7679
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
80+
pg_locale_t mylocale = 0; /* TODO */
7781

7882
if (clen == 1 || lc_ctype_is_c(collation))
7983
return isprint(TOUCHAR(ptr));
8084

81-
char2wchar(character, 2, ptr, clen, collation);
85+
char2wchar(character, 2, ptr, clen, mylocale);
8286

8387
return iswprint((wint_t) character[0]);
8488
}
@@ -246,6 +250,7 @@ lowerstr_with_len(const char *str, int len)
246250

247251
#ifdef USE_WIDE_UPPER_LOWER
248252
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
253+
pg_locale_t mylocale = 0; /* TODO */
249254
#endif
250255

251256
if (len == 0)
@@ -272,7 +277,7 @@ lowerstr_with_len(const char *str, int len)
272277
*/
273278
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
274279

275-
wlen = char2wchar(wstr, len + 1, str, len, collation);
280+
wlen = char2wchar(wstr, len + 1, str, len, mylocale);
276281
Assert(wlen <= len);
277282

278283
while (*wptr)
@@ -287,7 +292,7 @@ lowerstr_with_len(const char *str, int len)
287292
len = pg_database_encoding_max_length() * wlen + 1;
288293
out = (char *) palloc(len);
289294

290-
wlen = wchar2char(out, wstr, len, collation);
295+
wlen = wchar2char(out, wstr, len, mylocale);
291296

292297
pfree(wstr);
293298

src/backend/tsearch/wparser_def.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,21 +300,23 @@ TParserInit(char *str, int len)
300300
if (prs->charmaxlen > 1)
301301
{
302302
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
303+
pg_locale_t mylocale = 0; /* TODO */
303304

304305
prs->usewide = true;
305306
if (lc_ctype_is_c(collation))
306307
{
307308
/*
308309
* char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
309-
* be not equal to sizeof(wchar_t)
310+
* be different from sizeof(wchar_t)
310311
*/
311312
prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
312313
pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
313314
}
314315
else
315316
{
316317
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
317-
char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, collation);
318+
char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
319+
mylocale);
318320
}
319321
}
320322
else

src/backend/utils/adt/formatting.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,6 +1454,10 @@ str_numth(char *dest, char *num, int type)
14541454
return dest;
14551455
}
14561456

1457+
/*****************************************************************************
1458+
* upper/lower/initcap functions
1459+
*****************************************************************************/
1460+
14571461
/*
14581462
* If the system provides the needed functions for wide-character manipulation
14591463
* (which are all standardized by C99), then we implement upper/lower/initcap
@@ -1527,7 +1531,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
15271531
/* Output workspace cannot have more codes than input bytes */
15281532
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
15291533

1530-
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
1534+
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
15311535

15321536
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
15331537
{
@@ -1543,7 +1547,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
15431547
result_size = curr_char * pg_database_encoding_max_length() + 1;
15441548
result = palloc(result_size);
15451549

1546-
wchar2char(result, workspace, result_size, collid);
1550+
wchar2char(result, workspace, result_size, mylocale);
15471551
pfree(workspace);
15481552
}
15491553
#endif /* USE_WIDE_UPPER_LOWER */
@@ -1648,7 +1652,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
16481652
/* Output workspace cannot have more codes than input bytes */
16491653
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
16501654

1651-
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
1655+
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
16521656

16531657
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
16541658
{
@@ -1664,7 +1668,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
16641668
result_size = curr_char * pg_database_encoding_max_length() + 1;
16651669
result = palloc(result_size);
16661670

1667-
wchar2char(result, workspace, result_size, collid);
1671+
wchar2char(result, workspace, result_size, mylocale);
16681672
pfree(workspace);
16691673
}
16701674
#endif /* USE_WIDE_UPPER_LOWER */
@@ -1781,7 +1785,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
17811785
/* Output workspace cannot have more codes than input bytes */
17821786
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
17831787

1784-
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
1788+
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
17851789

17861790
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
17871791
{
@@ -1809,7 +1813,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
18091813
result_size = curr_char * pg_database_encoding_max_length() + 1;
18101814
result = palloc(result_size);
18111815

1812-
wchar2char(result, workspace, result_size, collid);
1816+
wchar2char(result, workspace, result_size, mylocale);
18131817
pfree(workspace);
18141818
}
18151819
#endif /* USE_WIDE_UPPER_LOWER */

src/backend/utils/adt/pg_locale.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)
10301030

10311031
return cache_entry->locale;
10321032
}
1033+
1034+
1035+
/*
1036+
* These functions convert from/to libc's wchar_t, *not* pg_wchar.
1037+
* Therefore we keep them here rather than with the mbutils code.
1038+
*/
1039+
1040+
#ifdef USE_WIDE_UPPER_LOWER
1041+
1042+
/*
1043+
* wchar2char --- convert wide characters to multibyte format
1044+
*
1045+
* This has the same API as the standard wcstombs_l() function; in particular,
1046+
* tolen is the maximum number of bytes to store at *to, and *from must be
1047+
* zero-terminated. The output will be zero-terminated iff there is room.
1048+
*/
1049+
size_t
1050+
wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
1051+
{
1052+
size_t result;
1053+
1054+
if (tolen == 0)
1055+
return 0;
1056+
1057+
#ifdef WIN32
1058+
1059+
/*
1060+
* On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1061+
* for some reason mbstowcs and wcstombs won't do this for us, so we use
1062+
* WideCharToMultiByte() here and MultiByteToWideChar() in char2wchar().
1063+
*/
1064+
if (GetDatabaseEncoding() == PG_UTF8)
1065+
{
1066+
result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1067+
NULL, NULL);
1068+
/* A zero return is failure */
1069+
if (result <= 0)
1070+
result = -1;
1071+
else
1072+
{
1073+
Assert(result <= tolen);
1074+
/* Microsoft counts the zero terminator in the result */
1075+
result--;
1076+
}
1077+
}
1078+
else
1079+
#endif /* WIN32 */
1080+
if (locale == (pg_locale_t) 0)
1081+
{
1082+
/* Use wcstombs directly for the default locale */
1083+
result = wcstombs(to, from, tolen);
1084+
}
1085+
else
1086+
{
1087+
#ifdef HAVE_LOCALE_T
1088+
#ifdef HAVE_WCSTOMBS_L
1089+
/* Use wcstombs_l for nondefault locales */
1090+
result = wcstombs_l(to, from, tolen, locale);
1091+
#else /* !HAVE_WCSTOMBS_L */
1092+
/* We have to temporarily set the locale as current ... ugh */
1093+
locale_t save_locale = uselocale(locale);
1094+
1095+
result = wcstombs(to, from, tolen);
1096+
1097+
uselocale(save_locale);
1098+
#endif /* HAVE_WCSTOMBS_L */
1099+
#else /* !HAVE_LOCALE_T */
1100+
/* Can't have locale != 0 without HAVE_LOCALE_T */
1101+
elog(ERROR, "wcstombs_l is not available");
1102+
result = 0; /* keep compiler quiet */
1103+
#endif /* HAVE_LOCALE_T */
1104+
}
1105+
1106+
return result;
1107+
}
1108+
1109+
/*
1110+
* char2wchar --- convert multibyte characters to wide characters
1111+
*
1112+
* This has almost the API of mbstowcs_l(), except that *from need not be
1113+
* null-terminated; instead, the number of input bytes is specified as
1114+
* fromlen. Also, we ereport() rather than returning -1 for invalid
1115+
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
1116+
* The output will be zero-terminated iff there is room.
1117+
*/
1118+
size_t
1119+
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1120+
pg_locale_t locale)
1121+
{
1122+
size_t result;
1123+
1124+
if (tolen == 0)
1125+
return 0;
1126+
1127+
#ifdef WIN32
1128+
/* See WIN32 "Unicode" comment above */
1129+
if (GetDatabaseEncoding() == PG_UTF8)
1130+
{
1131+
/* Win32 API does not work for zero-length input */
1132+
if (fromlen == 0)
1133+
result = 0;
1134+
else
1135+
{
1136+
result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1137+
/* A zero return is failure */
1138+
if (result == 0)
1139+
result = -1;
1140+
}
1141+
1142+
if (result != -1)
1143+
{
1144+
Assert(result < tolen);
1145+
/* Append trailing null wchar (MultiByteToWideChar() does not) */
1146+
to[result] = 0;
1147+
}
1148+
}
1149+
else
1150+
#endif /* WIN32 */
1151+
{
1152+
/* mbstowcs requires ending '\0' */
1153+
char *str = pnstrdup(from, fromlen);
1154+
1155+
if (locale == (pg_locale_t) 0)
1156+
{
1157+
/* Use mbstowcs directly for the default locale */
1158+
result = mbstowcs(to, str, tolen);
1159+
}
1160+
else
1161+
{
1162+
#ifdef HAVE_LOCALE_T
1163+
#ifdef HAVE_WCSTOMBS_L
1164+
/* Use mbstowcs_l for nondefault locales */
1165+
result = mbstowcs_l(to, str, tolen, locale);
1166+
#else /* !HAVE_WCSTOMBS_L */
1167+
/* We have to temporarily set the locale as current ... ugh */
1168+
locale_t save_locale = uselocale(locale);
1169+
1170+
result = mbstowcs(to, str, tolen);
1171+
1172+
uselocale(save_locale);
1173+
#endif /* HAVE_WCSTOMBS_L */
1174+
#else /* !HAVE_LOCALE_T */
1175+
/* Can't have locale != 0 without HAVE_LOCALE_T */
1176+
elog(ERROR, "mbstowcs_l is not available");
1177+
result = 0; /* keep compiler quiet */
1178+
#endif /* HAVE_LOCALE_T */
1179+
}
1180+
1181+
pfree(str);
1182+
}
1183+
1184+
if (result == -1)
1185+
{
1186+
/*
1187+
* Invalid multibyte character encountered. We try to give a useful
1188+
* error message by letting pg_verifymbstr check the string. But it's
1189+
* possible that the string is OK to us, and not OK to mbstowcs ---
1190+
* this suggests that the LC_CTYPE locale is different from the
1191+
* database encoding. Give a generic error message if verifymbstr
1192+
* can't find anything wrong.
1193+
*/
1194+
pg_verifymbstr(from, fromlen, false); /* might not return */
1195+
/* but if it does ... */
1196+
ereport(ERROR,
1197+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1198+
errmsg("invalid multibyte character for locale"),
1199+
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1200+
}
1201+
1202+
return result;
1203+
}
1204+
1205+
#endif /* USE_WIDE_UPPER_LOWER */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy