Skip to content

Commit f37fec8

Browse files
committed
Add unistr function
This allows decoding a string with Unicode escape sequences. It is similar to Unicode escape strings, but offers some more flexibility. Author: Pavel Stehule <pavel.stehule@gmail.com> Reviewed-by: Asif Rehman <asifr.rehman@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
1 parent ebedd0c commit f37fec8

File tree

6 files changed

+310
-1
lines changed

6 files changed

+310
-1
lines changed

doc/src/sgml/func.sgml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
35513551
</para></entry>
35523552
</row>
35533553

3554+
<row>
3555+
<entry role="func_table_entry"><para role="func_signature">
3556+
<indexterm>
3557+
<primary>unistr</primary>
3558+
</indexterm>
3559+
<function>unistr</function> ( <type>text</type> )
3560+
<returnvalue>text</returnvalue>
3561+
</para>
3562+
<para>
3563+
Evaluate escaped Unicode characters in the argument. Unicode characters
3564+
can be specified as
3565+
<literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
3566+
digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
3567+
hexadecimal digits),
3568+
<literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
3569+
digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
3570+
(8 hexadecimal digits). To specify a backslash, write two
3571+
backslashes. All other characters are taken literally.
3572+
</para>
3573+
3574+
<para>
3575+
If the server encoding is not UTF-8, the Unicode code point identified
3576+
by one of these escape sequences is converted to the actual server
3577+
encoding; an error is reported if that's not possible.
3578+
</para>
3579+
3580+
<para>
3581+
This function provides a (non-standard) alternative to string
3582+
constants with Unicode escapes (see <xref
3583+
linkend="sql-syntax-strings-uescape"/>).
3584+
</para>
3585+
3586+
<para>
3587+
<literal>unistr('\0441\043B\043E\043D')</literal>
3588+
<returnvalue>слон</returnvalue>
3589+
</para>
3590+
<para>
3591+
<literal>unistr('d\0061t\+000061')</literal>
3592+
<returnvalue>data</returnvalue>
3593+
</para>
3594+
<para>
3595+
<literal>unistr('d\u0061t\U00000061')</literal>
3596+
<returnvalue>data</returnvalue>
3597+
</para></entry>
3598+
</row>
3599+
35543600
</tbody>
35553601
</tgroup>
35563602
</table>

src/backend/utils/adt/varlena.c

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
63806380

63816381
PG_RETURN_BOOL(result);
63826382
}
6383+
6384+
/*
 * Report whether each of the first n bytes at instr is a hexadecimal digit.
 *
 * Scanning stops at the first byte that is not a hex digit.  When n is zero
 * no byte is examined and the result is true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	while (n-- > 0)
	{
		/* go through unsigned char so isxdigit() never sees a negative value */
		if (!isxdigit((unsigned char) *instr++))
			return false;
	}

	return true;
}
6396+
6397+
static unsigned int
6398+
hexval(unsigned char c)
6399+
{
6400+
if (c >= '0' && c <= '9')
6401+
return c - '0';
6402+
if (c >= 'a' && c <= 'f')
6403+
return c - 'a' + 0xA;
6404+
if (c >= 'A' && c <= 'F')
6405+
return c - 'A' + 0xA;
6406+
elog(ERROR, "invalid hexadecimal digit");
6407+
return 0; /* not reached */
6408+
}
6409+
6410+
/*
 * Convert the first n characters of instr, read as hexadecimal digits with
 * the most significant digit first, into an unsigned number.
 *
 * Each character is decoded with hexval(), which raises an error for
 * anything that is not a hex digit.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;

	/* "remaining - 1" digits still follow the current one: 4 bits apiece */
	for (size_t remaining = n; remaining > 0; remaining--)
		value += hexval(*instr++) << (4 * (remaining - 1));

	return value;
}
6423+
6424+
/*
6425+
* Replaces Unicode escape sequences by Unicode characters
6426+
*/
6427+
Datum
6428+
unistr(PG_FUNCTION_ARGS)
6429+
{
6430+
text *input_text = PG_GETARG_TEXT_PP(0);
6431+
char *instr;
6432+
int len;
6433+
StringInfoData str;
6434+
text *result;
6435+
pg_wchar pair_first = 0;
6436+
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6437+
6438+
instr = VARDATA_ANY(input_text);
6439+
len = VARSIZE_ANY_EXHDR(input_text);
6440+
6441+
initStringInfo(&str);
6442+
6443+
while (len > 0)
6444+
{
6445+
if (instr[0] == '\\')
6446+
{
6447+
if (len >= 2 &&
6448+
instr[1] == '\\')
6449+
{
6450+
if (pair_first)
6451+
goto invalid_pair;
6452+
appendStringInfoChar(&str, '\\');
6453+
instr += 2;
6454+
len -= 2;
6455+
}
6456+
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6457+
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6458+
{
6459+
pg_wchar unicode;
6460+
int offset = instr[1] == 'u' ? 2 : 1;
6461+
6462+
unicode = hexval_n(instr + offset, 4);
6463+
6464+
if (!is_valid_unicode_codepoint(unicode))
6465+
ereport(ERROR,
6466+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6467+
errmsg("invalid Unicode code point: %04X", unicode));
6468+
6469+
if (pair_first)
6470+
{
6471+
if (is_utf16_surrogate_second(unicode))
6472+
{
6473+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6474+
pair_first = 0;
6475+
}
6476+
else
6477+
goto invalid_pair;
6478+
}
6479+
else if (is_utf16_surrogate_second(unicode))
6480+
goto invalid_pair;
6481+
6482+
if (is_utf16_surrogate_first(unicode))
6483+
pair_first = unicode;
6484+
else
6485+
{
6486+
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6487+
appendStringInfoString(&str, cbuf);
6488+
}
6489+
6490+
instr += 4 + offset;
6491+
len -= 4 + offset;
6492+
}
6493+
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6494+
{
6495+
pg_wchar unicode;
6496+
6497+
unicode = hexval_n(instr + 2, 6);
6498+
6499+
if (!is_valid_unicode_codepoint(unicode))
6500+
ereport(ERROR,
6501+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6502+
errmsg("invalid Unicode code point: %04X", unicode));
6503+
6504+
if (pair_first)
6505+
{
6506+
if (is_utf16_surrogate_second(unicode))
6507+
{
6508+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6509+
pair_first = 0;
6510+
}
6511+
else
6512+
goto invalid_pair;
6513+
}
6514+
else if (is_utf16_surrogate_second(unicode))
6515+
goto invalid_pair;
6516+
6517+
if (is_utf16_surrogate_first(unicode))
6518+
pair_first = unicode;
6519+
else
6520+
{
6521+
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6522+
appendStringInfoString(&str, cbuf);
6523+
}
6524+
6525+
instr += 8;
6526+
len -= 8;
6527+
}
6528+
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6529+
{
6530+
pg_wchar unicode;
6531+
6532+
unicode = hexval_n(instr + 2, 8);
6533+
6534+
if (!is_valid_unicode_codepoint(unicode))
6535+
ereport(ERROR,
6536+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6537+
errmsg("invalid Unicode code point: %04X", unicode));
6538+
6539+
if (pair_first)
6540+
{
6541+
if (is_utf16_surrogate_second(unicode))
6542+
{
6543+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6544+
pair_first = 0;
6545+
}
6546+
else
6547+
goto invalid_pair;
6548+
}
6549+
else if (is_utf16_surrogate_second(unicode))
6550+
goto invalid_pair;
6551+
6552+
if (is_utf16_surrogate_first(unicode))
6553+
pair_first = unicode;
6554+
else
6555+
{
6556+
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6557+
appendStringInfoString(&str, cbuf);
6558+
}
6559+
6560+
instr += 10;
6561+
len -= 10;
6562+
}
6563+
else
6564+
ereport(ERROR,
6565+
(errcode(ERRCODE_SYNTAX_ERROR),
6566+
errmsg("invalid Unicode escape"),
6567+
errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6568+
}
6569+
else
6570+
{
6571+
if (pair_first)
6572+
goto invalid_pair;
6573+
6574+
appendStringInfoChar(&str, *instr++);
6575+
len--;
6576+
}
6577+
}
6578+
6579+
/* unfinished surrogate pair? */
6580+
if (pair_first)
6581+
goto invalid_pair;
6582+
6583+
result = cstring_to_text_with_len(str.data, str.len);
6584+
pfree(str.data);
6585+
6586+
PG_RETURN_TEXT_P(result);
6587+
6588+
invalid_pair:
6589+
ereport(ERROR,
6590+
(errcode(ERRCODE_SYNTAX_ERROR),
6591+
errmsg("invalid Unicode surrogate pair")));
6592+
}

src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 202103266
56+
#define CATALOG_VERSION_NO 202103291
5757

5858
#endif

src/include/catalog/pg_proc.dat

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11527,6 +11527,10 @@
1152711527
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
1152811528
prosrc => 'unicode_is_normalized' },
1152911529

11530+
{ oid => '9822', descr => 'unescape Unicode characters',
11531+
proname => 'unistr', prorettype => 'text', proargtypes => 'text',
11532+
prosrc => 'unistr' },
11533+
1153011534
{ oid => '4596', descr => 'I/O',
1153111535
proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
1153211536
proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },

src/test/regress/expected/strings.out

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
22342234
15
22352235
(1 row)
22362236

2237+
SELECT unistr('\0064at\+0000610');
2238+
unistr
2239+
--------
2240+
data0
2241+
(1 row)
2242+
2243+
SELECT unistr('d\u0061t\U000000610');
2244+
unistr
2245+
--------
2246+
data0
2247+
(1 row)
2248+
2249+
SELECT unistr('a\\b');
2250+
unistr
2251+
--------
2252+
a\b
2253+
(1 row)
2254+
2255+
-- errors:
2256+
SELECT unistr('wrong: \db99');
2257+
ERROR: invalid Unicode surrogate pair
2258+
SELECT unistr('wrong: \db99\0061');
2259+
ERROR: invalid Unicode surrogate pair
2260+
SELECT unistr('wrong: \+00db99\+000061');
2261+
ERROR: invalid Unicode surrogate pair
2262+
SELECT unistr('wrong: \+2FFFFF');
2263+
ERROR: invalid Unicode code point: 2FFFFF
2264+
SELECT unistr('wrong: \udb99\u0061');
2265+
ERROR: invalid Unicode surrogate pair
2266+
SELECT unistr('wrong: \U0000db99\U00000061');
2267+
ERROR: invalid Unicode surrogate pair
2268+
SELECT unistr('wrong: \U002FFFFF');
2269+
ERROR: invalid Unicode code point: 2FFFFF
2270+
SELECT unistr('wrong: \xyz');
2271+
ERROR: invalid Unicode escape
2272+
HINT: Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.

src/test/regress/sql/strings.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
746746
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
747747

748748
SELECT bit_count('\x1234567890'::bytea);
749+
750+
SELECT unistr('\0064at\+0000610');
751+
SELECT unistr('d\u0061t\U000000610');
752+
SELECT unistr('a\\b');
753+
-- errors:
754+
SELECT unistr('wrong: \db99');
755+
SELECT unistr('wrong: \db99\0061');
756+
SELECT unistr('wrong: \+00db99\+000061');
757+
SELECT unistr('wrong: \+2FFFFF');
758+
SELECT unistr('wrong: \udb99\u0061');
759+
SELECT unistr('wrong: \U0000db99\U00000061');
760+
SELECT unistr('wrong: \U002FFFFF');
761+
SELECT unistr('wrong: \xyz');

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy