Skip to content

Commit 4ea4f8b

Browse files
committed
Fix for Unicode characters above 0x10000.
John Hansen
1 parent 917c8bb commit 4ea4f8b

File tree

2 files changed

+83
-45
lines changed

2 files changed

+83
-45
lines changed

src/backend/utils/mb/wchar.c

Lines changed: 72 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* conversion functions between pg_wchar and multibyte streams.
33
* Tatsuo Ishii
4-
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.38 2004/09/17 21:59:57 petere Exp $
4+
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $
55
*
66
* WIN1250 client encoding updated by Pavel Behal
77
*
@@ -343,6 +343,31 @@ pg_johab_dsplen(const unsigned char *s)
343343
return (pg_euc_dsplen(s));
344344
}
345345

346+
/*
 * isLegalUTF8
 *
 * Check whether the len bytes starting at source form one well-formed
 * UTF-8 encoded character.
 *
 * source: first byte of the candidate sequence; NULL is rejected.
 * len:    number of bytes the sequence is expected to occupy.  It must be
 *         equal to pg_utf_mblen(source), otherwise false is returned.
 *
 * Returns true when every check passes, false otherwise.  All len bytes
 * are read, so the caller must make sure they are accessible.
 *
 * Sequences of 1 to 6 bytes are accepted (lead bytes up to 0xFD).  Every
 * trailing byte must lie in 0x80..0xBF.  The second byte is restricted
 * further for the lead bytes 0xE0 and 0xF0 (overlong forms), 0xED
 * (UTF-16 surrogates U+D800..U+DFFF, which are not legal in UTF-8) and
 * 0xF4 (code points above U+10FFFF).
 */
bool
isLegalUTF8(const UTF8 *source, int len)
{
	const UTF8 *srcptr;
	UTF8		a;

	/* Test for NULL before doing any pointer arithmetic on source. */
	if (source == NULL || pg_utf_mblen(source) != len)
		return false;

	/* Walk backwards from the last byte of the sequence to the lead byte. */
	srcptr = source + len;

	switch (len)
	{
		default:
			return false;

			/* Everything else falls through when "true"... */
		case 6:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 5:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 4:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 3:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 2:

			/*
			 * "a" is now the second byte of the sequence.  Apply the full
			 * continuation-byte range here: checking only the upper bound
			 * would let a byte below 0x80 slip through for lead bytes whose
			 * special case below tests just an upper limit.
			 */
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;

			switch (*source)
			{
					/* no fall-through in this inner switch */
				case 0xE0:		/* overlong 3-byte form */
					if (a < 0xA0)
						return false;
					break;
				case 0xED:		/* UTF-16 surrogate range */
					if (a > 0x9F)
						return false;
					break;
				case 0xF0:		/* overlong 4-byte form */
					if (a < 0x90)
						return false;
					break;
				case 0xF4:		/* beyond U+10FFFF */
					if (a > 0x8F)
						return false;
					break;
				default:
					break;
			}
			/* FALLTHROUGH */
		case 1:
			/* 0x80..0xBF are bare trail bytes; 0xC0/0xC1 are overlong leads */
			if (*source >= 0x80 && *source < 0xC2)
				return false;
			/* 0xFE and 0xFF never appear in UTF-8 */
			if (*source > 0xFD)
				return false;
	}

	return true;
}
370+
346371
/*
347372
* convert UTF-8 string to pg_wchar (UCS-2)
348373
* caller should allocate enough space for "to"
@@ -398,21 +423,27 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
398423
* returns the byte length of a UTF-8 word pointed to by s
399424
*/
400425
int
401-
pg_utf_mblen(const unsigned char *s)
426+
pg_utf_mblen(const UTF8 *s)
402427
{
403428
int len = 1;
404429

405430
if ((*s & 0x80) == 0)
406431
len = 1;
407432
else if ((*s & 0xe0) == 0xc0)
408433
len = 2;
409-
else if ((*s & 0xe0) == 0xe0)
410-
len = 3;
434+
else if ((*s & 0xf0) == 0xe0)
435+
len = 3;
436+
else if ((*s & 0xf8) == 0xf0)
437+
len = 4;
438+
else if ((*s & 0xfc) == 0xf8)
439+
len = 5;
440+
else if ((*s & 0xfe) == 0xfc)
441+
len = 6;
411442
return (len);
412443
}
413444

414445
static int
415-
pg_utf_dsplen(const unsigned char *s)
446+
pg_utf_dsplen(const UTF8 *s)
416447
{
417448
return 1; /* XXX fix me! */
418449
}
@@ -721,8 +752,8 @@ pg_wchar_tbl pg_wchar_table[] = {
721752
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
722753
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
723754
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
724-
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */
725-
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
755+
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */
756+
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
726757
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
727758
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
728759
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */
@@ -744,11 +775,11 @@ pg_wchar_tbl pg_wchar_table[] = {
744775
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */
745776
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */
746777
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */
747-
{0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
748-
{0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
749-
{0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
750-
{0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
751-
{0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
778+
{0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
779+
{0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
780+
{0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
781+
{0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
782+
{0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
752783
};
753784

754785
/* returns the byte length of a word for mule internal code */
@@ -822,51 +853,48 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
822853

823854
while (len > 0 && *mbstr)
824855
{
825-
/* special UTF-8 check */
826-
if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
827-
{
828-
if (noError)
829-
return false;
830-
ereport(ERROR,
831-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
832-
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
833-
}
834-
835856
l = pg_mblen(mbstr);
836857

837-
for (i = 1; i < l; i++)
838-
{
839-
/*
840-
* we expect that every multibyte char consists of bytes
841-
* having the 8th bit set
842-
*/
843-
if (i >= len || (mbstr[i] & 0x80) == 0)
858+
/* special UTF-8 check */
859+
if (encoding == PG_UTF8) {
860+
if(!isLegalUTF8(mbstr,l)) {
861+
if (noError) return false;
862+
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
863+
}
864+
} else {
865+
for (i = 1; i < l; i++)
844866
{
845-
char buf[8 * 2 + 1];
846-
char *p = buf;
847-
int j,
867+
/*
868+
* we expect that every multibyte char consists of bytes
869+
* having the 8th bit set
870+
*/
871+
if (i >= len || (mbstr[i] & 0x80) == 0)
872+
{
873+
char buf[8 * 2 + 1];
874+
char *p = buf;
875+
int j,
848876
jlimit;
849877

850-
if (noError)
851-
return false;
878+
if (noError)
879+
return false;
852880

853-
jlimit = Min(l, len);
854-
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
881+
jlimit = Min(l, len);
882+
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
855883

856-
for (j = 0; j < jlimit; j++)
857-
p += sprintf(p, "%02x", mbstr[j]);
884+
for (j = 0; j < jlimit; j++)
885+
p += sprintf(p, "%02x", mbstr[j]);
858886

859-
ereport(ERROR,
860-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
861-
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
862-
GetDatabaseEncodingName(), buf)));
887+
ereport(ERROR,
888+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
889+
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
890+
GetDatabaseEncodingName(), buf)));
891+
}
863892
}
864-
}
865893

894+
}
866895
len -= l;
867896
mbstr += l;
868897
}
869-
870898
return true;
871899
}
872900

src/include/mb/pg_wchar.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.53 2004/12/02 22:14:38 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.54 2004/12/02 22:37:14 momjian Exp $ */
22

33
#ifndef PG_WCHAR_H
44
#define PG_WCHAR_H
@@ -17,6 +17,14 @@
1717
*/
1818
typedef unsigned int pg_wchar;
1919

20+
21+
/*
22+
* The UTF types: unsigned integer typedefs named after the UTF-32, UTF-16
* and UTF-8 encoding forms.
23+
*/
24+
typedef unsigned int UTF32; /* at least 32 bits -- NOTE(review): ISO C only guarantees 16 bits for unsigned int; confirm this holds on every supported platform */
25+
typedef unsigned short UTF16; /* at least 16 bits */
26+
typedef unsigned char UTF8; /* typically 8 bits; one byte of a UTF-8 stream */
27+
2028
/*
2129
* various definitions for EUC
2230
*/
@@ -340,4 +348,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
340348
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
341349
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
342350

351+
/*
 * Returns true if the len bytes starting at source pass the UTF-8
 * well-formedness checks for a single character, false otherwise
 * (including when source is NULL or len differs from pg_utf_mblen(source)).
 */
extern bool isLegalUTF8(const UTF8 *source, int len);
352+
343353
#endif /* PG_WCHAR_H */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy