Skip to content

Commit 08e0b34

Browse files
committed
Back out fix for Unicode characters above 0x10000
1 parent 5d7a555 commit 08e0b34

File tree

3 files changed

+47
-84
lines changed

3 files changed

+47
-84
lines changed

doc/src/sgml/postgres.sgml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
<!--
2-
$PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.65 2004/11/12 21:50:53 tgl Exp $
2+
$PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.66 2004/12/03 01:20:14 momjian Exp $
33
-->
44

55
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.2//EN" [
@@ -179,6 +179,7 @@ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.65 2004/11/12 21:50:53 tgl Exp
179179
&lobj;
180180
&ecpg;
181181
&infoschema;
182+
&external_projects;
182183

183184
</part>
184185

src/backend/utils/mb/wchar.c

Lines changed: 44 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
/*
22
* conversion functions between pg_wchar and multibyte streams.
33
* Tatsuo Ishii
4-
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $
4+
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40 2004/12/03 01:20:20 momjian Exp $
55
*
66
* WIN1250 client encoding updated by Pavel Behal
77
*
@@ -343,31 +343,6 @@ pg_johab_dsplen(const unsigned char *s)
343343
return (pg_euc_dsplen(s));
344344
}
345345

346-
bool isLegalUTF8(const UTF8 *source, int len) {
347-
UTF8 a;
348-
const UTF8 *srcptr = source+len;
349-
if(!source || (pg_utf_mblen(source) != len)) return false;
350-
switch (len) {
351-
default: return false;
352-
/* Everything else falls through when "true"... */
353-
case 6: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
354-
case 5: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
355-
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
356-
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
357-
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
358-
switch (*source) {
359-
/* no fall-through in this inner switch */
360-
case 0xE0: if (a < 0xA0) return false; break;
361-
case 0xF0: if (a < 0x90) return false; break;
362-
case 0xF4: if (a > 0x8F) return false; break;
363-
default: if (a < 0x80) return false;
364-
}
365-
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
366-
if (*source > 0xFD) return false;
367-
}
368-
return true;
369-
}
370-
371346
/*
372347
* convert UTF-8 string to pg_wchar (UCS-2)
373348
* caller should allocate enough space for "to"
@@ -423,27 +398,21 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
423398
* returns the byte length of a UTF-8 word pointed to by s
424399
*/
425400
int
426-
pg_utf_mblen(const UTF8 *s)
401+
pg_utf_mblen(const unsigned char *s)
427402
{
428403
int len = 1;
429404

430405
if ((*s & 0x80) == 0)
431406
len = 1;
432407
else if ((*s & 0xe0) == 0xc0)
433408
len = 2;
434-
else if ((*s & 0xf0) == 0xe0)
435-
len = 3;
436-
else if ((*s & 0xf8) == 0xf0)
437-
len = 4;
438-
else if ((*s & 0xfc) == 0xf8)
439-
len = 5;
440-
else if ((*s & 0xfe) == 0xfc)
441-
len = 6;
409+
else if ((*s & 0xe0) == 0xe0)
410+
len = 3;
442411
return (len);
443412
}
444413

445414
static int
446-
pg_utf_dsplen(const UTF8 *s)
415+
pg_utf_dsplen(const unsigned char *s)
447416
{
448417
return 1; /* XXX fix me! */
449418
}
@@ -752,8 +721,8 @@ pg_wchar_tbl pg_wchar_table[] = {
752721
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
753722
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
754723
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
755-
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */
756-
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
724+
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */
725+
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
757726
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
758727
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
759728
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */
@@ -775,11 +744,11 @@ pg_wchar_tbl pg_wchar_table[] = {
775744
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */
776745
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */
777746
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */
778-
{0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
779-
{0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
780-
{0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
781-
{0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
782-
{0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
747+
{0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
748+
{0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
749+
{0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
750+
{0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
751+
{0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
783752
};
784753

785754
/* returns the byte length of a word for mule internal code */
@@ -853,48 +822,51 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
853822

854823
while (len > 0 && *mbstr)
855824
{
825+
/* special UTF-8 check */
826+
if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
827+
{
828+
if (noError)
829+
return false;
830+
ereport(ERROR,
831+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
832+
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
833+
}
834+
856835
l = pg_mblen(mbstr);
857836

858-
/* special UTF-8 check */
859-
if (encoding == PG_UTF8) {
860-
if(!isLegalUTF8(mbstr,l)) {
861-
if (noError) return false;
862-
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
863-
}
864-
} else {
865-
for (i = 1; i < l; i++)
837+
for (i = 1; i < l; i++)
838+
{
839+
/*
840+
* we expect that every multibyte char consists of bytes
841+
* having the 8th bit set
842+
*/
843+
if (i >= len || (mbstr[i] & 0x80) == 0)
866844
{
867-
/*
868-
* we expect that every multibyte char consists of bytes
869-
* having the 8th bit set
870-
*/
871-
if (i >= len || (mbstr[i] & 0x80) == 0)
872-
{
873-
char buf[8 * 2 + 1];
874-
char *p = buf;
875-
int j,
845+
char buf[8 * 2 + 1];
846+
char *p = buf;
847+
int j,
876848
jlimit;
877849

878-
if (noError)
879-
return false;
850+
if (noError)
851+
return false;
880852

881-
jlimit = Min(l, len);
882-
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
853+
jlimit = Min(l, len);
854+
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
883855

884-
for (j = 0; j < jlimit; j++)
885-
p += sprintf(p, "%02x", mbstr[j]);
856+
for (j = 0; j < jlimit; j++)
857+
p += sprintf(p, "%02x", mbstr[j]);
886858

887-
ereport(ERROR,
888-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
889-
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
890-
GetDatabaseEncodingName(), buf)));
891-
}
859+
ereport(ERROR,
860+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
861+
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
862+
GetDatabaseEncodingName(), buf)));
892863
}
893-
894864
}
865+
895866
len -= l;
896867
mbstr += l;
897868
}
869+
898870
return true;
899871
}
900872

src/include/mb/pg_wchar.h

Lines changed: 1 addition & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.54 2004/12/02 22:37:14 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.55 2004/12/03 01:20:33 momjian Exp $ */
22

33
#ifndef PG_WCHAR_H
44
#define PG_WCHAR_H
@@ -17,14 +17,6 @@
1717
*/
1818
typedef unsigned int pg_wchar;
1919

20-
21-
/*
22-
* The UTF types
23-
*/
24-
typedef unsigned int UTF32; /* at least 32 bits */
25-
typedef unsigned short UTF16; /* at least 16 bits */
26-
typedef unsigned char UTF8; /* typically 8 bits */
27-
2820
/*
2921
* various definitions for EUC
3022
*/
@@ -348,6 +340,4 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
348340
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
349341
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
350342

351-
extern bool isLegalUTF8(const UTF8 *source, int len);
352-
353343
#endif /* PG_WCHAR_H */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy