Skip to content

Commit 09022de

Browse files
committed
Improve documentation about MULE encoding.
This commit improves the comments in pg_wchar.h and creates #define symbols for some formerly hard-coded values. No substantive code changes. Tatsuo Ishii and Tom Lane
1 parent 47a2adc commit 09022de

File tree

3 files changed

+110
-58
lines changed

3 files changed

+110
-58
lines changed

src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
168168
*p++ = LC_CNS11643_2;
169169
else
170170
{
171-
*p++ = 0x9d; /* LCPRV2 */
171+
/* other planes are MULE private charsets */
172+
*p++ = LCPRV2_B;
172173
*p++ = c1 - 0xa3 + LC_CNS11643_3;
173174
}
174175
*p++ = euc[2];
@@ -235,9 +236,9 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
235236
*p++ = mic[1];
236237
*p++ = mic[2];
237238
}
238-
else if (c1 == 0x9d &&
239+
else if (c1 == LCPRV2_B &&
239240
mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
240-
{ /* LCPRV2? */
241+
{
241242
*p++ = SS2;
242243
*p++ = mic[1] - LC_CNS11643_3 + 0xa3;
243244
*p++ = mic[2];
@@ -286,10 +287,9 @@ big52mic(const unsigned char *big5, unsigned char *p, int len)
286287
cnsBuf = BIG5toCNS(big5buf, &lc);
287288
if (lc != 0)
288289
{
290+
/* Planes 3 and 4 are MULE private charsets */
289291
if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
290-
{
291-
*p++ = 0x9d; /* LCPRV2 */
292-
}
292+
*p++ = LCPRV2_B;
293293
*p++ = lc; /* Plane No. */
294294
*p++ = (cnsBuf >> 8) & 0x00ff;
295295
*p++ = cnsBuf & 0x00ff;
@@ -332,10 +332,9 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len)
332332
if (l < 0)
333333
report_invalid_encoding(PG_MULE_INTERNAL,
334334
(const char *) mic, len);
335-
/* 0x9d means LCPRV2 */
336-
if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == 0x9d)
335+
if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == LCPRV2_B)
337336
{
338-
if (c1 == 0x9d)
337+
if (c1 == LCPRV2_B)
339338
{
340339
c1 = mic[1]; /* get plane no. */
341340
cnsBuf = (mic[2] << 8) | mic[3];

src/backend/utils/mb/wchar.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,12 @@ pg_mule_dsplen(const unsigned char *s)
742742
{
743743
int len;
744744

745+
/*
746+
* Note: it's not really appropriate to assume that all multibyte charsets
747+
* are double-wide on screen. But this seems an okay approximation for
748+
* the MULE charsets we currently support.
749+
*/
750+
745751
if (IS_LC1(*s))
746752
len = 1;
747753
else if (IS_LCPRV1(*s))

src/include/mb/pg_wchar.h

Lines changed: 96 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -36,36 +36,60 @@ typedef unsigned int pg_wchar;
3636
#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
3737
#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
3838

39-
/*
40-
* Leading byte types or leading prefix byte for MULE internal code.
41-
* See http://www.xemacs.org for more details. (there is a doc titled
42-
* "XEmacs Internals Manual", "MULE Character Sets and Encodings"
43-
* section.)
44-
*/
45-
/*
46-
* Is a leading byte for "official" single byte encodings?
47-
*/
48-
#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
49-
/*
50-
* Is a prefix byte for "private" single byte encodings?
51-
*/
52-
#define IS_LCPRV1(c) ((unsigned char)(c) == 0x9a || (unsigned char)(c) == 0x9b)
53-
/*
54-
* Is a leading byte for "official" multibyte encodings?
55-
*/
56-
#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
57-
/*
58-
* Is a prefix byte for "private" multibyte encodings?
59-
*/
60-
#define IS_LCPRV2(c) ((unsigned char)(c) == 0x9c || (unsigned char)(c) == 0x9d)
61-
6239
/*----------------------------------------------------
63-
* leading characters
40+
* MULE Internal Encoding (MIC)
41+
*
42+
* This encoding follows the design used within XEmacs; it is meant to
43+
* subsume many externally-defined character sets. Each character includes
44+
* identification of the character set it belongs to, so the encoding is
45+
* general but somewhat bulky.
46+
*
47+
* Currently PostgreSQL supports 5 types of MULE character sets:
48+
*
49+
* 1) 1-byte ASCII characters. Each byte is below 0x80.
50+
*
51+
* 2) "Official" single byte charsets such as ISO-8859-1 (Latin1).
52+
* Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is
53+
* an identifier for the charset (in the range 0x81 to 0x8d) and C1
54+
* is the character code (in the range 0xa0 to 0xff).
55+
*
56+
* 3) "Private" single byte charsets such as SISHENG. Each MULE
57+
* character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1
58+
* is a private-charset flag, LC12 is an identifier for the charset,
59+
* and C1 is the character code (in the range 0xa0 to 0xff).
60+
* LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf)
61+
* or 0x9b (if LC12 is in the range 0xe0 to 0xef).
62+
*
63+
* 4) "Official" multibyte charsets such as JIS X0208. Each MULE
64+
* character consists of 3 bytes: LC2 + C1 + C2, where LC2 is
65+
* an identifier for the charset (in the range 0x90 to 0x99) and C1
66+
* and C2 form the character code (each in the range 0xa0 to 0xff).
67+
*
68+
* 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3.
69+
* Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2,
70+
* where LCPRV2 is a private-charset flag, LC22 is an identifier for
71+
* the charset, and C1 and C2 form the character code (each in the range
72+
* 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0
73+
* to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe).
74+
*
75+
* "Official" encodings are those that have been assigned code numbers by
76+
* the XEmacs project; "private" encodings have Postgres-specific charset
77+
* identifiers.
78+
*
79+
* See the "XEmacs Internals Manual", available at http://www.xemacs.org,
80+
* for more details. Note that for historical reasons, Postgres'
81+
* private-charset flag values do not match what XEmacs says they should be,
82+
* so this isn't really exactly MULE (not that private charsets would be
83+
* interoperable anyway).
6484
*----------------------------------------------------
6585
*/
6686

6787
/*
68-
* Official single byte encodings (0x81-0x8e)
88+
* Charset identifiers (also called "leading bytes" in the MULE documentation)
89+
*/
90+
91+
/*
92+
* Charset IDs for official single byte encodings (0x81-0x8e)
6993
*/
7094
#define LC_ISO8859_1 0x81 /* ISO8859 Latin 1 */
7195
#define LC_ISO8859_2 0x82 /* ISO8859 Latin 2 */
@@ -79,21 +103,19 @@ typedef unsigned int pg_wchar;
79103
#define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */
80104
/* Note that 0x8b seems to be unused as of Emacs 20.7.
81105
* However, there might be a chance that 0x8b could be used
82-
* in later version of Emacs.
106+
* in later versions of Emacs.
83107
*/
84108
#define LC_KOI8_R 0x8b /* Cyrillic KOI8-R */
85-
#define LC_KOI8_U 0x8b /* Cyrillic KOI8-U */
86109
#define LC_ISO8859_5 0x8c /* ISO8859 Cyrillic */
87110
#define LC_ISO8859_9 0x8d /* ISO8859 Latin 5 (not supported yet) */
88111
/* #define FREE 0x8e free (unused) */
112+
/* #define CONTROL_1 0x8f control characters (unused) */
89113

90-
/*
91-
* Unused
92-
*/
93-
#define CONTROL_1 0x8f /* control characters (unused) */
114+
/* Is a leading byte for "official" single byte encodings? */
115+
#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
94116

95117
/*
96-
* Official multibyte byte encodings (0x90-0x99)
118+
* Charset IDs for official multibyte encodings (0x90-0x99)
97119
* 0x9a-0x9d are free. 0x9e and 0x9f are reserved.
98120
*/
99121
#define LC_JISX0208_1978 0x90 /* Japanese Kanji, old JIS (not supported) */
@@ -108,45 +130,70 @@ typedef unsigned int pg_wchar;
108130
#define LC_BIG5_1 0x98 /* Plane 1 Chinese traditional (not supported) */
109131
#define LC_BIG5_2 0x99 /* Plane 2 Chinese traditional (not supported) */
110132

133+
/* Is a leading byte for "official" multibyte encodings? */
134+
#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
135+
111136
/*
112-
* Private single byte encodings (0xa0-0xef)
137+
* Postgres-specific prefix bytes for "private" single byte encodings
138+
* (According to the MULE docs, we should be using 0x9e for this)
113139
*/
114-
#define LC_SISHENG 0xa0/* Chinese SiSheng characters for
115-
* PinYin/ZhuYin (not supported) */
116-
#define LC_IPA 0xa1/* IPA (International Phonetic Association)
117-
* (not supported) */
118-
#define LC_VISCII_LOWER 0xa2/* Vietnamese VISCII1.1 lower-case (not
119-
* supported) */
120-
#define LC_VISCII_UPPER 0xa3/* Vietnamese VISCII1.1 upper-case (not
121-
* supported) */
140+
#define LCPRV1_A 0x9a
141+
#define LCPRV1_B 0x9b
142+
#define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B)
143+
144+
/*
145+
* Postgres-specific prefix bytes for "private" multibyte encodings
146+
* (According to the MULE docs, we should be using 0x9f for this)
147+
*/
148+
#define LCPRV2_A 0x9c
149+
#define LCPRV2_B 0x9d
150+
#define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B)
151+
152+
/*
153+
* Charset IDs for private single byte encodings (0xa0-0xef)
154+
*/
155+
#define LC_SISHENG 0xa0 /* Chinese SiSheng characters for
156+
* PinYin/ZhuYin (not supported) */
157+
#define LC_IPA 0xa1 /* IPA (International Phonetic Association)
158+
* (not supported) */
159+
#define LC_VISCII_LOWER 0xa2 /* Vietnamese VISCII1.1 lower-case (not
160+
* supported) */
161+
#define LC_VISCII_UPPER 0xa3 /* Vietnamese VISCII1.1 upper-case (not
162+
* supported) */
122163
#define LC_ARABIC_DIGIT 0xa4 /* Arabic digit (not supported) */
123164
#define LC_ARABIC_1_COLUMN 0xa5 /* Arabic 1-column (not supported) */
124165
#define LC_ASCII_RIGHT_TO_LEFT 0xa6 /* ASCII (left half of ISO8859-1) with
125166
* right-to-left direction (not
126167
* supported) */
127-
#define LC_LAO 0xa7/* Lao characters (ISO10646 0E80..0EDF) (not
128-
* supported) */
168+
#define LC_LAO 0xa7 /* Lao characters (ISO10646 0E80..0EDF)
169+
* (not supported) */
129170
#define LC_ARABIC_2_COLUMN 0xa8 /* Arabic 2-column (not supported) */
130171

131172
/*
132-
* Private multibyte encodings (0xf0-0xff)
173+
* Charset IDs for private multibyte encodings (0xf0-0xff)
133174
*/
134-
#define LC_INDIAN_1_COLUMN 0xf0/* Indian charset for 1-column width glypps
135-
* (not supported) */
136-
#define LC_TIBETAN_1_COLUMN 0xf1 /* Tibetan 1 column glyph (not supported) */
175+
#define LC_INDIAN_1_COLUMN 0xf0 /* Indian charset for 1-column width glyphs
176+
* (not supported) */
177+
#define LC_TIBETAN_1_COLUMN 0xf1 /* Tibetan 1-column width glyphs
178+
* (not supported) */
137179
#define LC_ETHIOPIC 0xf5 /* Ethiopic characters (not supported) */
138180
#define LC_CNS11643_3 0xf6 /* CNS 11643-1992 Plane 3 */
139181
#define LC_CNS11643_4 0xf7 /* CNS 11643-1992 Plane 4 */
140182
#define LC_CNS11643_5 0xf8 /* CNS 11643-1992 Plane 5 */
141183
#define LC_CNS11643_6 0xf9 /* CNS 11643-1992 Plane 6 */
142184
#define LC_CNS11643_7 0xfa /* CNS 11643-1992 Plane 7 */
143-
#define LC_INDIAN_2_COLUMN 0xfb/* Indian charset for 2-column width glypps
144-
* (not supported) */
185+
#define LC_INDIAN_2_COLUMN 0xfb /* Indian charset for 2-column width glyphs
186+
* (not supported) */
145187
#define LC_TIBETAN 0xfc /* Tibetan (not supported) */
146188
/* #define FREE 0xfd free (unused) */
147189
/* #define FREE 0xfe free (unused) */
148190
/* #define FREE 0xff free (unused) */
149191

192+
/*----------------------------------------------------
193+
* end of MULE stuff
194+
*----------------------------------------------------
195+
*/
196+
150197
/*
151198
* PostgreSQL encoding identifiers
152199
*

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy