Skip to content

Commit 976a1a4

Browse files
committed
Improve to_date/to_number/to_timestamp behavior with multibyte characters.
The documentation says that these functions skip one input character per literal (non-pattern) format character. Actually, though, they skipped one input *byte* per literal *byte*, which could be hugely confusing if either data or format contained multibyte characters.

To fix, adjust the FormatNode representation and parse_format() so that multibyte format characters are stored as one FormatNode not several, and adjust the data-skipping code to advance by pg_mblen() not necessarily one byte. There's no user-visible behavior change on the to_char() side, although the internal representation changes.

Commit e87d496 had already fixed most places where we skip characters on the basis of non-literal format patterns to advance by characters not bytes, but this gets one more place, the SKIP_THth macro. I think everything in formatting.c now gets that right.

It'd be nice to have some regression test cases covering this behavior; but of course there's no way to do so in an encoding-agnostic way, and many of the interesting aspects would also require unportable locale selections. So I've not bothered here.

Discussion: https://postgr.es/m/28186.1510957703@sss.pgh.pa.us
1 parent 63ca863 commit 976a1a4

File tree

1 file changed

+41
-27
lines changed

1 file changed

+41
-27
lines changed

src/backend/utils/adt/formatting.c

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,6 @@ typedef enum
151151
FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */
152152
} FromCharDateMode;
153153

154-
typedef struct FormatNode FormatNode;
155-
156154
typedef struct
157155
{
158156
const char *name;
@@ -162,13 +160,13 @@ typedef struct
162160
FromCharDateMode date_mode;
163161
} KeyWord;
164162

165-
struct FormatNode
163+
typedef struct
166164
{
167-
int type; /* node type */
168-
const KeyWord *key; /* if node type is KEYWORD */
169-
char character; /* if node type is CHAR */
170-
int suffix; /* keyword suffix */
171-
};
165+
int type; /* NODE_TYPE_XXX, see below */
166+
const KeyWord *key; /* if type is ACTION */
167+
char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */
168+
int suffix; /* keyword prefix/suffix code, if any */
169+
} FormatNode;
172170

173171
#define NODE_TYPE_END 1
174172
#define NODE_TYPE_ACTION 2
@@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
12821280
}
12831281
else if (*str)
12841282
{
1283+
int chlen;
1284+
12851285
/*
12861286
* Process double-quoted literal string, if any
12871287
*/
12881288
if (*str == '"')
12891289
{
1290-
while (*(++str))
1290+
str++;
1291+
while (*str)
12911292
{
12921293
if (*str == '"')
12931294
{
@@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
12971298
/* backslash quotes the next character, if any */
12981299
if (*str == '\\' && *(str + 1))
12991300
str++;
1301+
chlen = pg_mblen(str);
13001302
n->type = NODE_TYPE_CHAR;
1301-
n->character = *str;
1303+
memcpy(n->character, str, chlen);
1304+
n->character[chlen] = '\0';
13021305
n->key = NULL;
13031306
n->suffix = 0;
13041307
n++;
1308+
str += chlen;
13051309
}
13061310
}
13071311
else
@@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
13121316
*/
13131317
if (*str == '\\' && *(str + 1) == '"')
13141318
str++;
1319+
chlen = pg_mblen(str);
13151320
n->type = NODE_TYPE_CHAR;
1316-
n->character = *str;
1321+
memcpy(n->character, str, chlen);
1322+
n->character[chlen] = '\0';
13171323
n->key = NULL;
13181324
n->suffix = 0;
13191325
n++;
1320-
str++;
1326+
str += chlen;
13211327
}
13221328
}
13231329
}
@@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max)
13491355
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
13501356
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
13511357
else if (n->type == NODE_TYPE_CHAR)
1352-
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
1358+
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
1359+
a, n->character);
13531360
else if (n->type == NODE_TYPE_END)
13541361
{
13551362
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
@@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff)
20082015
do { \
20092016
if (S_THth(_suf)) \
20102017
{ \
2011-
if (*(ptr)) (ptr)++; \
2012-
if (*(ptr)) (ptr)++; \
2018+
if (*(ptr)) (ptr) += pg_mblen(ptr); \
2019+
if (*(ptr)) (ptr) += pg_mblen(ptr); \
20132020
} \
20142021
} while (0)
20152022

@@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n)
20762083

20772084
return true;
20782085
}
2079-
else if (isdigit((unsigned char) n->character))
2086+
else if (n->character[1] == '\0' &&
2087+
isdigit((unsigned char) n->character[0]))
20802088
return false;
20812089

20822090
return true; /* some non-digit input (separator) */
@@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
24052413
{
24062414
if (n->type != NODE_TYPE_ACTION)
24072415
{
2408-
*s = n->character;
2409-
s++;
2416+
strcpy(s, n->character);
2417+
s += strlen(s);
24102418
continue;
24112419
}
24122420

@@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
29742982
* we don't insist that the consumed character match the format's
29752983
* character.
29762984
*/
2977-
s++;
2985+
s += pg_mblen(s);
29782986
continue;
29792987
}
29802988

@@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num)
42174225
/*
42184226
* These macros are used in NUM_processor() and its subsidiary routines.
42194227
* OVERLOAD_TEST: true if we've reached end of input string
4220-
* AMOUNT_TEST(s): true if at least s characters remain in string
4228+
* AMOUNT_TEST(s): true if at least s bytes remain in string
42214229
*/
42224230
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
42234231
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
@@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
48214829
if (!Np->is_to_char)
48224830
{
48234831
/*
4824-
* Check at least one character remains to be scanned. (In
4825-
* actions below, must use AMOUNT_TEST if we want to read more
4826-
* characters than that.)
4832+
* Check at least one byte remains to be scanned. (In actions
4833+
* below, must use AMOUNT_TEST if we want to read more bytes than
4834+
* that.)
48274835
*/
48284836
if (OVERLOAD_TEST)
48294837
break;
@@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
50815089
* In TO_CHAR, non-pattern characters in the format are copied to
50825090
* the output. In TO_NUMBER, we skip one input character for each
50835091
* non-pattern format character, whether or not it matches the
5084-
* format character. (Currently, that's actually implemented as
5085-
* skipping one input byte per non-pattern format byte, which is
5086-
* wrong...)
5092+
* format character.
50875093
*/
50885094
if (Np->is_to_char)
5089-
*Np->inout_p = n->character;
5095+
{
5096+
strcpy(Np->inout_p, n->character);
5097+
Np->inout_p += strlen(Np->inout_p);
5098+
}
5099+
else
5100+
{
5101+
Np->inout_p += pg_mblen(Np->inout_p);
5102+
}
5103+
continue;
50905104
}
50915105
Np->inout_p++;
50925106
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy