Skip to content

Commit b66863d

Browse files
committed
use UCS4 instead of UTF8
1 parent 8e5e00b commit b66863d

File tree

1 file changed

+41
-28
lines changed

1 file changed

+41
-28
lines changed

Modules/_json.c

Lines changed: 41 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -303,12 +303,11 @@ escape_unicode(PyObject *pystr)
303303
return rval;
304304
}
305305

306-
#define ESCAPE_BUF_SIZE 200
307-
308306
// Take a PyUnicode pystr and write an escaped string to writer. (ensure_ascii)
309307
static int
310308
write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr)
311309
{
310+
#define ESCAPE_BUF_SIZE 200
312311
Py_ssize_t i;
313312
Py_ssize_t input_chars;
314313
Py_ssize_t buf_len;
@@ -367,60 +366,74 @@ static int
367366
write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr)
368367
{
369368
Py_ssize_t i;
370-
Py_ssize_t input_size;
371-
Py_ssize_t buf_len;
372-
const unsigned char *input;
369+
Py_ssize_t input_chars;
370+
Py_ssize_t chars = 0;
371+
const void *input;
372+
int kind;
373373
int ret;
374-
unsigned char c = 0;
375-
char buf[ESCAPE_BUF_SIZE];
374+
Py_UCS4 output[ESCAPE_BUF_SIZE];
376375

377-
// We don't need to escape non-ASCII chars.
378-
// So we just copy UTF-8 from pystr to buf.
379-
input = (const unsigned char*) PyUnicode_AsUTF8AndSize(pystr, &input_size);
376+
input_chars = PyUnicode_GET_LENGTH(pystr);
377+
input = PyUnicode_DATA(pystr);
378+
kind = PyUnicode_KIND(pystr);
380379

381380
ret = PyUnicodeWriter_WriteChar(writer, '"');
382381
if (ret) return ret;
383382

384383
// Fast path for strings that don't need escaping at all: e.g. "id", "name"
385-
for (i = 0; i < input_size; i++) {
386-
c = input[i];
384+
for (i = 0; i < input_chars; i++) {
385+
Py_UCS4 c = PyUnicode_READ(kind, input, i);
387386
if (c <= 0x1f || c == '\\' || c == '"') {
388387
break;
389388
}
390389
}
391390
if (i > 0) {
392-
ret = PyUnicodeWriter_WriteUTF8(writer, (const char *)input, i);
391+
ret = PyUnicodeWriter_WriteSubstring(writer, pystr, 0, i);
393392
if (ret) return ret;
394393
}
395-
if (i == input_size) {
394+
if (i == input_chars) {
396395
return PyUnicodeWriter_WriteChar(writer, '"');
397396
}
398397

399-
buf_len = ascii_escape_unichar(c, (unsigned char *)buf, 0);
398+
for (; i < input_chars; i++) {
399+
Py_UCS4 c = PyUnicode_READ(kind, input, i);
400400

401-
for (i++; i < input_size; i++) {
402-
c = input[i];
403-
if (c <= 0x1f || c == '\\' || c == '"') {
404-
buf_len = ascii_escape_unichar(c, (unsigned char *)buf, buf_len);
405-
}
406-
else {
407-
buf[buf_len++] = c;
401+
// Same as ENCODE_OUTPUT in escape_unicode
402+
switch (c) {
403+
case '\\': output[chars++] = '\\'; output[chars++] = c; break;
404+
case '"': output[chars++] = '\\'; output[chars++] = c; break;
405+
case '\b': output[chars++] = '\\'; output[chars++] = 'b'; break;
406+
case '\f': output[chars++] = '\\'; output[chars++] = 'f'; break;
407+
case '\n': output[chars++] = '\\'; output[chars++] = 'n'; break;
408+
case '\r': output[chars++] = '\\'; output[chars++] = 'r'; break;
409+
case '\t': output[chars++] = '\\'; output[chars++] = 't'; break;
410+
default:
411+
if (c <= 0x1f) {
412+
output[chars++] = '\\';
413+
output[chars++] = 'u';
414+
output[chars++] = '0';
415+
output[chars++] = '0';
416+
output[chars++] = Py_hexdigits[(c >> 4) & 0xf];
417+
output[chars++] = Py_hexdigits[(c ) & 0xf];
418+
} else {
419+
output[chars++] = c;
420+
}
408421
}
409422

410-
if (buf_len + 6 > ESCAPE_BUF_SIZE) {
411-
ret = PyUnicodeWriter_WriteUTF8(writer, buf, buf_len);
423+
if (chars + 6 > ESCAPE_BUF_SIZE) {
424+
ret = PyUnicodeWriter_WriteUCS4(writer, output, chars);
412425
if (ret) return ret;
413-
buf_len = 0;
426+
chars = 0;
414427
}
415428
}
416429

417430
assert(buf_len < ESCAPE_BUF_SIZE);
418-
buf[buf_len++] = '"';
419-
return PyUnicodeWriter_WriteUTF8(writer, buf, buf_len);
431+
output[chars++] = '"';
432+
return PyUnicodeWriter_WriteUCS4(writer, output, chars);
420433
}
421-
422434
#undef ESCAPE_BUF_SIZE
423435

436+
424437
static void
425438
raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end)
426439
{

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy