Skip to content

Commit d8406b9

Browse files
committed
Ignore XML declaration in xpath_internal(), for UTF8 databases.
When a value contained an XML declaration naming some other encoding, this function interpreted UTF8 bytes as the named encoding, yielding mojibake. xml_parse() already has similar logic. This would be necessary but not sufficient for non-UTF8 databases, so preserve behavior there until the xpath facility can support such databases comprehensively. Back-patch to 9.3 (all supported versions).

Pavel Stehule and Noah Misch

Discussion: https://postgr.es/m/CAFj8pRC-dM=tT=QkGi+Achkm+gwPmjyOayGuUfXVumCxkDgYWg@mail.gmail.com
1 parent 6290646 commit d8406b9

File tree

5 files changed

+142
-1
lines changed

5 files changed

+142
-1
lines changed

src/backend/utils/adt/xml.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3792,6 +3792,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
37923792
int32 xpath_len;
37933793
xmlChar *string;
37943794
xmlChar *xpath_expr;
3795+
size_t xmldecl_len = 0;
37953796
int i;
37963797
int ndim;
37973798
Datum *ns_names_uris;
@@ -3852,6 +3853,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
38523853
memcpy(xpath_expr, VARDATA(xpath_expr_text), xpath_len);
38533854
xpath_expr[xpath_len] = '\0';
38543855

3856+
/*
3857+
* In a UTF8 database, skip any xml declaration, which might assert
3858+
* another encoding. Ignore parse_xml_decl() failure, letting
3859+
* xmlCtxtReadMemory() report parse errors. Documentation disclaims
3860+
* xpath() support for non-ASCII data in non-UTF8 databases, so leave
3861+
* those scenarios bug-compatible with historical behavior.
3862+
*/
3863+
if (GetDatabaseEncoding() == PG_UTF8)
3864+
parse_xml_decl(string, &xmldecl_len, NULL, NULL, NULL);
3865+
38553866
xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL);
38563867

38573868
PG_TRY();
@@ -3866,7 +3877,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
38663877
if (ctxt == NULL || xmlerrcxt->err_occurred)
38673878
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
38683879
"could not allocate parser context");
3869-
doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
3880+
doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
3881+
len - xmldecl_len, NULL, NULL, 0);
38703882
if (doc == NULL || xmlerrcxt->err_occurred)
38713883
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
38723884
"could not parse XML document");

src/test/regress/expected/xml.out

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,37 @@ SELECT xpath('/nosuchtag', '<root/>');
653653
{}
654654
(1 row)
655655

656+
-- Round-trip non-ASCII data through xpath().
657+
DO $$
658+
DECLARE
659+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
660+
degree_symbol text;
661+
res xml[];
662+
BEGIN
663+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
664+
-- the server encoding is not UTF8. The EXCEPTION block below,
665+
-- currently dead code, will be relevant if we remove this limitation.
666+
IF current_setting('server_encoding') <> 'UTF8' THEN
667+
RAISE LOG 'skip: encoding % unsupported for xml',
668+
current_setting('server_encoding');
669+
RETURN;
670+
END IF;
671+
672+
degree_symbol := convert_from('\xc2b0', 'UTF8');
673+
res := xpath('text()', (xml_declaration ||
674+
'<x>' || degree_symbol || '</x>')::xml);
675+
IF degree_symbol <> res[1]::text THEN
676+
RAISE 'expected % (%), got % (%)',
677+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
678+
res[1], convert_to(res[1]::text, 'UTF8');
679+
END IF;
680+
EXCEPTION
681+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
682+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
683+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
684+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
685+
END
686+
$$;
656687
-- Test xmlexists and xpath_exists
657688
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
658689
xmlexists

src/test/regress/expected/xml_1.out

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>');
564564
^
565565
DETAIL: This functionality requires the server to be built with libxml support.
566566
HINT: You need to rebuild PostgreSQL using --with-libxml.
567+
-- Round-trip non-ASCII data through xpath().
568+
DO $$
569+
DECLARE
570+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
571+
degree_symbol text;
572+
res xml[];
573+
BEGIN
574+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
575+
-- the server encoding is not UTF8. The EXCEPTION block below,
576+
-- currently dead code, will be relevant if we remove this limitation.
577+
IF current_setting('server_encoding') <> 'UTF8' THEN
578+
RAISE LOG 'skip: encoding % unsupported for xml',
579+
current_setting('server_encoding');
580+
RETURN;
581+
END IF;
582+
583+
degree_symbol := convert_from('\xc2b0', 'UTF8');
584+
res := xpath('text()', (xml_declaration ||
585+
'<x>' || degree_symbol || '</x>')::xml);
586+
IF degree_symbol <> res[1]::text THEN
587+
RAISE 'expected % (%), got % (%)',
588+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
589+
res[1], convert_to(res[1]::text, 'UTF8');
590+
END IF;
591+
EXCEPTION
592+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
593+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
594+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
595+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
596+
END
597+
$$;
598+
ERROR: unsupported XML feature
599+
DETAIL: This functionality requires the server to be built with libxml support.
600+
HINT: You need to rebuild PostgreSQL using --with-libxml.
601+
CONTEXT: PL/pgSQL function inline_code_block line 17 at assignment
567602
-- Test xmlexists and xpath_exists
568603
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
569604
ERROR: unsupported XML feature

src/test/regress/expected/xml_2.out

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,37 @@ SELECT xpath('/nosuchtag', '<root/>');
635635
{}
636636
(1 row)
637637

638+
-- Round-trip non-ASCII data through xpath().
639+
DO $$
640+
DECLARE
641+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
642+
degree_symbol text;
643+
res xml[];
644+
BEGIN
645+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
646+
-- the server encoding is not UTF8. The EXCEPTION block below,
647+
-- currently dead code, will be relevant if we remove this limitation.
648+
IF current_setting('server_encoding') <> 'UTF8' THEN
649+
RAISE LOG 'skip: encoding % unsupported for xml',
650+
current_setting('server_encoding');
651+
RETURN;
652+
END IF;
653+
654+
degree_symbol := convert_from('\xc2b0', 'UTF8');
655+
res := xpath('text()', (xml_declaration ||
656+
'<x>' || degree_symbol || '</x>')::xml);
657+
IF degree_symbol <> res[1]::text THEN
658+
RAISE 'expected % (%), got % (%)',
659+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
660+
res[1], convert_to(res[1]::text, 'UTF8');
661+
END IF;
662+
EXCEPTION
663+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
664+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
665+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
666+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
667+
END
668+
$$;
638669
-- Test xmlexists and xpath_exists
639670
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
640671
xmlexists

src/test/regress/sql/xml.sql

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>');
186186
SELECT xpath('name(/*)', '<root><sub/><sub/></root>');
187187
SELECT xpath('/nosuchtag', '<root/>');
188188

189+
-- Round-trip non-ASCII data through xpath().
190+
DO $$
191+
DECLARE
192+
xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>';
193+
degree_symbol text;
194+
res xml[];
195+
BEGIN
196+
-- Per the documentation, xpath() doesn't work on non-ASCII data when
197+
-- the server encoding is not UTF8. The EXCEPTION block below,
198+
-- currently dead code, will be relevant if we remove this limitation.
199+
IF current_setting('server_encoding') <> 'UTF8' THEN
200+
RAISE LOG 'skip: encoding % unsupported for xml',
201+
current_setting('server_encoding');
202+
RETURN;
203+
END IF;
204+
205+
degree_symbol := convert_from('\xc2b0', 'UTF8');
206+
res := xpath('text()', (xml_declaration ||
207+
'<x>' || degree_symbol || '</x>')::xml);
208+
IF degree_symbol <> res[1]::text THEN
209+
RAISE 'expected % (%), got % (%)',
210+
degree_symbol, convert_to(degree_symbol, 'UTF8'),
211+
res[1], convert_to(res[1]::text, 'UTF8');
212+
END IF;
213+
EXCEPTION
214+
-- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8"
215+
WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM;
216+
-- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist
217+
WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM;
218+
END
219+
$$;
220+
189221
-- Test xmlexists and xpath_exists
190222
SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');
191223
SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>');

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy