Skip to content

Commit 2e15785

Browse files
authored
gh-119182: Add PyUnicodeWriter_WriteUCS4() function (#120849)
1 parent a47abdb commit 2e15785

File tree

7 files changed

+131
-6
lines changed

7 files changed

+131
-6
lines changed

Doc/c-api/unicode.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,6 +1563,15 @@ object.
15631563
On success, return ``0``.
15641564
On error, set an exception, leave the writer unchanged, and return ``-1``.
15651565
1566+
.. c:function:: int PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *writer, Py_UCS4 *str, Py_ssize_t size)
1567+
1568+
Write the UCS4 string *str* into *writer*.
1569+
1570+
*size* is the number of UCS4 characters to write from *str*. It must be greater than or equal to 0.
1571+
1572+
On success, return ``0``.
1573+
On error, set an exception, leave the writer unchanged, and return ``-1``.
1574+
15661575
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
15671576
15681577
Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*.

Doc/whatsnew/3.14.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ New Features
314314
* :c:func:`PyUnicodeWriter_Finish`.
315315
* :c:func:`PyUnicodeWriter_WriteChar`.
316316
* :c:func:`PyUnicodeWriter_WriteUTF8`.
317+
* :c:func:`PyUnicodeWriter_WriteUCS4`.
317318
* :c:func:`PyUnicodeWriter_WriteWideChar`.
318319
* :c:func:`PyUnicodeWriter_WriteStr`.
319320
* :c:func:`PyUnicodeWriter_WriteRepr`.

Include/cpython/unicodeobject.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
463463
PyUnicodeWriter *writer,
464464
const wchar_t *str,
465465
Py_ssize_t size);
466+
/* Write `size` UCS4 characters from `str` into `writer`.
   Return 0 on success, -1 on error. */
PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4(
467+
PyUnicodeWriter *writer,
468+
Py_UCS4 *str,
469+
Py_ssize_t size);
466470

467471
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
468472
PyUnicodeWriter *writer,

Lib/test/test_capi/test_unicode.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1826,8 +1826,42 @@ def test_widechar(self):
18261826
writer.write_widechar("latin1=\xE9")
18271827
writer.write_widechar("-")
18281828
writer.write_widechar("euro=\u20AC")
1829+
writer.write_char("-")
1830+
writer.write_widechar("max=\U0010ffff")
18291831
writer.write_char('.')
1830-
self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC.")
1832+
self.assertEqual(writer.finish(),
1833+
"latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
1834+
1835+
def test_ucs4(self):
1836+
# Exercise write_ucs4(str, size) with ASCII, Latin-1, BMP and non-BMP
# text; the expected results below show how each size is applied.
writer = self.create_writer(0)
1837+
# size=5 is shorter than the string: only "ascii" is written
writer.write_ucs4("ascii IGNORED", 5)
1838+
writer.write_char("-")
1839+
writer.write_ucs4("latin1=\xe9", 8)
1840+
writer.write_char("-")
1841+
writer.write_ucs4("euro=\u20ac", 6)
1842+
writer.write_char("-")
1843+
writer.write_ucs4("max=\U0010ffff", 5)
1844+
writer.write_char(".")
1845+
self.assertEqual(writer.finish(),
1846+
"ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
1847+
1848+
# Test some special characters
1849+
writer = self.create_writer(0)
1850+
# Lone surrogate character
1851+
writer.write_ucs4("lone\uDC80", 5)
1852+
writer.write_char("-")
1853+
# Surrogate pair: size=5 stops after the high surrogate, so only
# "pair\uDBFF" is expected in the result
writer.write_ucs4("pair\uDBFF\uDFFF", 5)
1855+
writer.write_char("-")
1856+
# Embedded null character: all 7 characters are expected in the result
writer.write_ucs4("null[\0]", 7)
1857+
self.assertEqual(writer.finish(),
1858+
"lone\udc80-pair\udbff-null[\0]")
1859+
1860+
# Invalid size: a negative size must raise ValueError
1861+
writer = self.create_writer(0)
1862+
with self.assertRaises(ValueError):
1863+
writer.write_ucs4("text", -1)
1864+
18311865

18321866

18331867
@unittest.skipIf(ctypes is None, 'need ctypes')

Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object:
55
* :c:func:`PyUnicodeWriter_Finish`.
66
* :c:func:`PyUnicodeWriter_WriteChar`.
77
* :c:func:`PyUnicodeWriter_WriteUTF8`.
8+
* :c:func:`PyUnicodeWriter_WriteUCS4`.
9+
* :c:func:`PyUnicodeWriter_WriteWideChar`.
810
* :c:func:`PyUnicodeWriter_WriteStr`.
911
* :c:func:`PyUnicodeWriter_WriteRepr`.
1012
* :c:func:`PyUnicodeWriter_WriteSubstring`.
1113
* :c:func:`PyUnicodeWriter_Format`.
14+
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
1215

1316
Patch by Victor Stinner.

Modules/_testcapi/unicode.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,36 @@ writer_write_widechar(PyObject *self_raw, PyObject *args)
360360
}
361361

362362

363+
/* Test wrapper: writer.write_ucs4(str, size).
   Converts `str` to a temporary UCS4 buffer and passes it, together with
   `size`, to PyUnicodeWriter_WriteUCS4().
   Returns None on success, NULL on failure. */
static PyObject*
364+
writer_write_ucs4(PyObject *self_raw, PyObject *args)
365+
{
366+
WriterObject *self = (WriterObject *)self_raw;
367+
if (writer_check(self) < 0) {
368+
return NULL;
369+
}
370+
371+
PyObject *str;
372+
Py_ssize_t size;
373+
/* "U": a str object, "n": a Py_ssize_t */
if (!PyArg_ParseTuple(args, "Un", &str, &size)) {
374+
return NULL;
375+
}
376+
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
377+
/* Cap size at the string length; a negative size is smaller than len and
   is therefore passed through unchanged to PyUnicodeWriter_WriteUCS4(). */
size = Py_MIN(size, len);
378+
379+
/* Temporary UCS4 copy of str, released with PyMem_Free() below */
Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str);
380+
if (ucs4 == NULL) {
381+
return NULL;
382+
}
383+
384+
int res = PyUnicodeWriter_WriteUCS4(self->writer, ucs4, size);
385+
/* Free the copy before testing res so it is released on both paths */
PyMem_Free(ucs4);
386+
if (res < 0) {
387+
return NULL;
388+
}
389+
Py_RETURN_NONE;
390+
}
391+
392+
363393
static PyObject*
364394
writer_write_str(PyObject *self_raw, PyObject *args)
365395
{
@@ -484,6 +514,7 @@ static PyMethodDef writer_methods[] = {
484514
{"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS},
485515
{"write_utf8", _PyCFunction_CAST(writer_write_utf8), METH_VARARGS},
486516
{"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS},
517+
{"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS},
487518
{"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS},
488519
{"write_repr", _PyCFunction_CAST(writer_write_repr), METH_VARARGS},
489520
{"write_substring", _PyCFunction_CAST(writer_write_substring), METH_VARARGS},

Objects/unicodeobject.c

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2035,11 +2035,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
20352035
if (!converted) {
20362036
return -1;
20372037
}
2038-
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2039-
PyMem_Free(converted);
20402038

2041-
int res = _PyUnicodeWriter_WriteStr(writer, unicode);
2042-
Py_DECREF(unicode);
2039+
int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2040+
PyMem_Free(converted);
20432041
return res;
20442042
}
20452043
#endif
@@ -2289,6 +2287,51 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
22892287
return res;
22902288
}
22912289

2290+
2291+
/* Append `size` UCS4 characters read from `str` to the writer.

   Returns 0 on success. Returns -1 if `size` is negative (ValueError is
   set) or if _PyUnicodeWriter_Prepare() fails. A `size` of 0 is a no-op
   that returns 0. */
int
2292+
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2293+
Py_UCS4 *str,
2294+
Py_ssize_t size)
2295+
{
2296+
_PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2297+
2298+
/* NOTE(review): the message says "positive", but size == 0 is accepted
   just below; only negative sizes are rejected. */
if (size < 0) {
2299+
PyErr_SetString(PyExc_ValueError,
2300+
"size must be positive");
2301+
return -1;
2302+
}
2303+
2304+
/* Nothing to write: return before touching the writer */
if (size == 0) {
2305+
return 0;
2306+
}
2307+
2308+
/* Scan the input range [str, str + size); the result is handed to
   _PyUnicodeWriter_Prepare() along with the number of characters. */
Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2309+
2310+
if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2311+
return -1;
2312+
}
2313+
2314+
/* Read kind only after the prepare call above */
int kind = writer->kind;
2315+
/* Byte address of the write position: pos is scaled by kind */
void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2316+
if (kind == PyUnicode_1BYTE_KIND) {
2317+
/* 1-byte buffer: convert each Py_UCS4 to Py_UCS1 */
_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2318+
str, str + size,
2319+
data);
2320+
}
2321+
else if (kind == PyUnicode_2BYTE_KIND) {
2322+
/* 2-byte buffer: convert each Py_UCS4 to Py_UCS2 */
_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2323+
str, str + size,
2324+
data);
2325+
}
2326+
else {
2327+
/* Any other kind: copy the Py_UCS4 values as-is, no conversion */
memcpy(data, str, size * sizeof(Py_UCS4));
2328+
}
2329+
/* Advance the write position by the number of characters written */
writer->pos += size;
2330+
2331+
return 0;
2332+
}
2333+
2334+
22922335
PyObject*
22932336
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
22942337
{
@@ -13357,7 +13400,7 @@ PyUnicodeWriter*
1335713400
PyUnicodeWriter_Create(Py_ssize_t length)
1335813401
{
1335913402
if (length < 0) {
13360-
PyErr_SetString(PyExc_TypeError,
13403+
PyErr_SetString(PyExc_ValueError,
1336113404
"length must be positive");
1336213405
return NULL;
1336313406
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy