Skip to content

Commit f2a01b0

Browse files
committed
Fix localization support for multibyte encoding and C locale.
Slightly reworked patch from Tatsuo Ishii
1 parent 7021d6f commit f2a01b0

File tree

3 files changed

+144
-52
lines changed

3 files changed

+144
-52
lines changed

contrib/tsearch2/ts_locale.c

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@
1212
size_t
1313
wchar2char(char *to, const wchar_t *from, size_t len)
1414
{
15+
if (len == 0)
16+
return 0;
17+
1518
if (GetDatabaseEncoding() == PG_UTF8)
1619
{
1720
int r;
1821

19-
if (len == 0)
20-
return 0;
21-
2222
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
2323
NULL, NULL);
2424

@@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len)
3434

3535
return wcstombs(to, from, len);
3636
}
37+
#endif /* WIN32 */
3738

3839
size_t
3940
char2wchar(wchar_t *to, const char *from, size_t len)
4041
{
42+
if (len == 0)
43+
return 0;
44+
45+
#ifdef WIN32
4146
if (GetDatabaseEncoding() == PG_UTF8)
4247
{
4348
int r;
4449

45-
if (len == 0)
46-
return 0;
47-
4850
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
4951

5052
if (!r)
@@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len)
6062

6163
return r;
6264
}
65+
else
66+
#endif /* WIN32 */
67+
if ( lc_ctype_is_c() )
68+
{
69+
/*
70+
* pg_mb2wchar_with_len always adds trailing '\0', so
71+
* 'to' should be allocated with sufficient space
72+
*/
73+
return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
74+
}
6375

6476
return mbstowcs(to, from, len);
6577
}
66-
#endif /* WIN32 */
6778

6879
int
6980
_t_isalpha(const char *ptr)
7081
{
71-
wchar_t character;
82+
wchar_t character[2];
83+
84+
if (lc_ctype_is_c())
85+
return isalpha(TOUCHAR(ptr));
7286

73-
char2wchar(&character, ptr, 1);
87+
char2wchar(character, ptr, 1);
7488

75-
return iswalpha((wint_t) character);
89+
return iswalpha((wint_t) *character);
7690
}
7791

7892
int
7993
_t_isprint(const char *ptr)
8094
{
81-
wchar_t character;
95+
wchar_t character[2];
96+
97+
if (lc_ctype_is_c())
98+
return isprint(TOUCHAR(ptr));
8299

83-
char2wchar(&character, ptr, 1);
100+
char2wchar(character, ptr, 1);
84101

85-
return iswprint((wint_t) character);
102+
return iswprint((wint_t) *character);
86103
}
87104
#endif /* TS_USE_WIDE */
88105

@@ -126,7 +143,7 @@ lowerstr(char *str)
126143
if ( wlen < 0 )
127144
ereport(ERROR,
128145
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
129-
errmsg("transalation failed from server encoding to wchar_t")));
146+
errmsg("translation failed from server encoding to wchar_t")));
130147

131148
Assert(wlen<=len);
132149
wstr[wlen] = 0;
@@ -152,7 +169,7 @@ lowerstr(char *str)
152169
if ( wlen < 0 )
153170
ereport(ERROR,
154171
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
155-
errmsg("transalation failed from wchar_t to server encoding %d", errno)));
172+
errmsg("translation failed from wchar_t to server encoding %d", errno)));
156173
Assert(wlen<=len);
157174
out[wlen]='\0';
158175
}

contrib/tsearch2/ts_locale.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,17 @@
3030
#define TOUCHAR(x) (*((unsigned char*)(x)))
3131

3232
#ifdef TS_USE_WIDE
33+
size_t char2wchar(wchar_t *to, const char *from, size_t len);
3334

3435
#ifdef WIN32
3536

3637
size_t wchar2char(char *to, const wchar_t *from, size_t len);
37-
size_t char2wchar(wchar_t *to, const char *from, size_t len);
38+
3839
#else /* WIN32 */
3940

40-
/* correct mbstowcs */
41-
#define char2wchar mbstowcs
41+
/* correct wcstombs */
4242
#define wchar2char wcstombs
43+
4344
#endif /* WIN32 */
4445

4546
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
@@ -55,10 +56,10 @@ extern int _t_isprint(const char *ptr);
5556
*/
5657
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
5758

58-
#define COPYCHAR(d,s) do { \
59-
int lll = pg_mblen( s ); \
60-
\
61-
while( lll-- ) \
59+
#define COPYCHAR(d,s) do { \
60+
int lll = pg_mblen( s ); \
61+
\
62+
while( lll-- ) \
6263
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
6364
} while(0)
6465

contrib/tsearch2/wordparser/parser.c

Lines changed: 104 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */
22

33
#include "postgres.h"
44

@@ -40,16 +40,13 @@ TParserInit(char *str, int len)
4040
#ifdef TS_USE_WIDE
4141

4242
/*
43-
* Use wide char code only when max encoding length > 1 and ctype != C.
44-
* Some operating systems fail with multi-byte encodings and a C locale.
45-
* Also, for a C locale there is no need to process as multibyte. From
46-
* backend/utils/adt/oracle_compat.c Teodor
43+
* Use wide char code only when max encoding length > 1.
4744
*/
4845

49-
if (prs->charmaxlen > 1 && !lc_ctype_is_c())
46+
if (prs->charmaxlen > 1)
5047
{
5148
prs->usewide = true;
52-
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
49+
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
5350
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
5451
}
5552
else
@@ -83,25 +80,99 @@ TParserClose(TParser * prs)
8380

8481
/*
8582
* defining support functions, equivalent to the is* macros, but
86-
* working with any possible encodings and locales
83+
* working with any possible encodings and locales. Note
84+
* that with a multibyte encoding and the C locale, isw* functions may fail
85+
* or give wrong results. Note 2: multibyte encodings with the C locale
86+
* are often used for Asian languages.
8787
*/
8888

8989
#ifdef TS_USE_WIDE
9090

91-
#define p_iswhat(type) \
92-
static int \
93-
p_is##type(TParser *prs) { \
94-
Assert( prs->state ); \
95-
return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
96-
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
97-
} \
98-
\
99-
static int \
100-
p_isnot##type(TParser *prs) { \
101-
return !p_is##type(prs); \
91+
#define p_iswhat(type) \
92+
static int \
93+
p_is##type(TParser *prs) { \
94+
Assert( prs->state ); \
95+
if ( prs->usewide ) \
96+
{ \
97+
if ( lc_ctype_is_c() ) \
98+
return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
99+
\
100+
return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
101+
} \
102+
\
103+
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
104+
} \
105+
\
106+
static int \
107+
p_isnot##type(TParser *prs) { \
108+
return !p_is##type(prs); \
102109
}
103110

111+
static int
112+
p_isalnum(TParser *prs)
113+
{
114+
Assert( prs->state );
115+
116+
if (prs->usewide)
117+
{
118+
if (lc_ctype_is_c())
119+
{
120+
unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
121+
122+
/*
123+
* any non-ascii symbol with multibyte encoding
124+
* with C-locale is an alpha character
125+
*/
126+
if ( c > 0x7f )
127+
return 1;
128+
129+
return isalnum(0xff & c);
130+
}
131+
132+
return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
133+
}
104134

135+
return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
136+
}
137+
138+
static int
139+
p_isnotalnum(TParser *prs)
140+
{
141+
return !p_isalnum(prs);
142+
}
143+
144+
static int
145+
p_isalpha(TParser *prs)
146+
{
147+
Assert( prs->state );
148+
149+
if (prs->usewide)
150+
{
151+
if (lc_ctype_is_c())
152+
{
153+
unsigned int c = *(prs->wstr + prs->state->poschar);
154+
155+
/*
156+
* any non-ascii symbol with multibyte encoding
157+
* with C-locale is an alpha character
158+
*/
159+
if ( c > 0x7f )
160+
return 1;
161+
162+
return isalpha(0xff & c);
163+
}
164+
165+
return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
166+
}
167+
168+
return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
169+
}
170+
171+
static int
172+
p_isnotalpha(TParser *prs)
173+
{
174+
return !p_isalpha(prs);
175+
}
105176

106177
/* p_iseq should be used only for ascii symbols */
107178

@@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c)
111182
Assert(prs->state);
112183
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
113184
}
185+
114186
#else /* TS_USE_WIDE */
115187

116-
#define p_iswhat(type) \
117-
static int \
118-
p_is##type(TParser *prs) { \
119-
Assert( prs->state ); \
120-
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
121-
} \
122-
\
123-
static int \
124-
p_isnot##type(TParser *prs) { \
125-
return !p_is##type(prs); \
188+
#define p_iswhat(type) \
189+
static int \
190+
p_is##type(TParser *prs) { \
191+
Assert( prs->state ); \
192+
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
193+
} \
194+
\
195+
static int \
196+
p_isnot##type(TParser *prs) { \
197+
return !p_is##type(prs); \
126198
}
127199

128200

@@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c)
132204
Assert(prs->state);
133205
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
134206
}
135-
#endif /* TS_USE_WIDE */
136207

137208
p_iswhat(alnum)
138209
p_iswhat(alpha)
210+
211+
#endif /* TS_USE_WIDE */
212+
139213
p_iswhat(digit)
140214
p_iswhat(lower)
141215
p_iswhat(print)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy