Skip to content

Commit 3ad8b84

Browse files
committed
Add some tests for encoding conversion in COPY TO/FROM
This adds a couple of tests to trigger encoding conversion in COPY FROM/TO when the input and server encodings do not match, or when need_transcoding is set to true in the COPY state data. These tests rely on UTF8 <-> LATIN1 for the valid cases, as LATIN1 accepts any bytes, and on UTF8 <-> EUC_JP for the invalid cases, where a character cannot be understood, causing a conversion failure. Both ENCODING and client_encoding are covered. Test suggested by Andres Freund. Author: Sutou Kouhei. Discussion: https://postgr.es/m/20240206222445.hzq22pb2nye7rm67@awork3.anarazel.de
1 parent bf9165b commit 3ad8b84

File tree

4 files changed

+108
-1
lines changed

4 files changed

+108
-1
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
--
2+
-- Test cases for encoding with COPY commands
3+
--
4+
-- skip test if not UTF8 server encoding
5+
SELECT getdatabaseencoding() <> 'UTF8'
6+
AS skip_test \gset
7+
\if :skip_test
8+
\quit
9+
\endif
10+
-- directory paths are passed to us in environment variables
11+
\getenv abs_builddir PG_ABS_BUILDDIR
12+
\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
13+
CREATE TABLE copy_encoding_tab (t text);
14+
-- Valid cases
15+
-- Use ENCODING option
16+
-- U+3042 HIRAGANA LETTER A
17+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
18+
-- Read UTF8 data as LATIN1: no error
19+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
20+
-- Use client_encoding
21+
SET client_encoding TO UTF8;
22+
-- U+3042 HIRAGANA LETTER A
23+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
24+
-- Read UTF8 data as LATIN1: no error
25+
SET client_encoding TO LATIN1;
26+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
27+
RESET client_encoding;
28+
-- Invalid cases
29+
-- Use ENCODING explicitly
30+
-- U+3042 HIRAGANA LETTER A
31+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
32+
-- Read UTF8 data as EUC_JP: error (invalid byte sequence)
33+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
34+
ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
35+
CONTEXT: COPY copy_encoding_tab, line 1
36+
-- Use client_encoding
37+
SET client_encoding TO UTF8;
38+
-- U+3042 HIRAGANA LETTER A
39+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
40+
-- Read UTF8 data as EUC_JP: error (invalid byte sequence)
41+
SET client_encoding TO EUC_JP;
42+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
43+
ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
44+
CONTEXT: COPY copy_encoding_tab, line 1
45+
RESET client_encoding;
46+
DROP TABLE copy_encoding_tab;
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
--
2+
-- Test cases for encoding with COPY commands
3+
--
4+
-- skip test if not UTF8 server encoding
5+
SELECT getdatabaseencoding() <> 'UTF8'
6+
AS skip_test \gset
7+
\if :skip_test
8+
\quit

src/test/regress/parallel_schedule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comment
3636
# execute two copy tests in parallel, to check that copy itself
3737
# is concurrent safe.
3838
# ----------
39-
test: copy copyselect copydml insert insert_conflict
39+
test: copy copyselect copydml copyencoding insert insert_conflict
4040

4141
# ----------
4242
# More groups of parallel tests

src/test/regress/sql/copyencoding.sql

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
--
2+
-- Test cases for encoding with COPY commands
3+
--
4+
5+
-- skip test if not UTF8 server encoding
6+
SELECT getdatabaseencoding() <> 'UTF8'
7+
AS skip_test \gset
8+
\if :skip_test
9+
\quit
10+
\endif
11+
12+
-- directory paths are passed to us in environment variables
13+
\getenv abs_builddir PG_ABS_BUILDDIR
14+
15+
\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
16+
17+
CREATE TABLE copy_encoding_tab (t text);
18+
19+
-- Valid cases
20+
21+
-- Use ENCODING option
22+
-- U+3042 HIRAGANA LETTER A
23+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
24+
-- Read UTF8 data as LATIN1: no error
25+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
26+
27+
-- Use client_encoding
28+
SET client_encoding TO UTF8;
29+
-- U+3042 HIRAGANA LETTER A
30+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
31+
-- Read UTF8 data as LATIN1: no error
32+
SET client_encoding TO LATIN1;
33+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
34+
RESET client_encoding;
35+
36+
-- Invalid cases
37+
38+
-- Use ENCODING explicitly
39+
-- U+3042 HIRAGANA LETTER A
40+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
41+
-- Read UTF8 data as EUC_JP: error (invalid byte sequence)
42+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
43+
44+
-- Use client_encoding
45+
SET client_encoding TO UTF8;
46+
-- U+3042 HIRAGANA LETTER A
47+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
48+
-- Read UTF8 data as EUC_JP: error (invalid byte sequence)
49+
SET client_encoding TO EUC_JP;
50+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
51+
RESET client_encoding;
52+
53+
DROP TABLE copy_encoding_tab;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy