From d6d0203f20d3d88b46a0ace3b8db99c55305ed0c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 10:59:41 +0200 Subject: [PATCH 1/6] gh-119182: Add PyUnicodeWriter_WriteUCS4() function --- Doc/c-api/unicode.rst | 10 +++++ Doc/whatsnew/3.14.rst | 1 + Include/cpython/unicodeobject.h | 4 ++ Lib/test/test_capi/test_unicode.py | 18 +++++++- ...-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst | 3 ++ Modules/_testcapi/unicode.c | 31 +++++++++++++ Objects/unicodeobject.c | 44 +++++++++++++++++++ 7 files changed, 110 insertions(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 4ea20bde38c1db..3b4b5162daf24e 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1563,6 +1563,16 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. +.. c:function:: int PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *writer, Py_UCS4 *str, Py_ssize_t size) + + Writer the UCS4 string *str* into *writer*. + + *size* is a number of UCS4 characters. If *size* is equal to ``-1``, get the + string length (search the NUL character). + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. + .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 2eefa232cdcd02..806e4a9c62b4ea 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -291,6 +291,7 @@ New Features * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. + * :c:func:`PyUnicodeWriter_WriteUCS4`. * :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 059bec8618c8d9..91799137101280 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -463,6 +463,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size); +PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4( + PyUnicodeWriter *writer, + Py_UCS4 *str, + Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( PyUnicodeWriter *writer, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 36106b0730dd26..f3c1e06fbdbe4f 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1784,8 +1784,24 @@ def test_widechar(self): writer.write_widechar("latin1=\xE9") writer.write_widechar("-") writer.write_widechar("euro=\u20AC") + writer.write_char("-") + writer.write_ucs4("max=\U0010ffff", -1) writer.write_char('.') - self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC.") + self.assertEqual(writer.finish(), + "latin1=\xE9-euro=\u20AC-max=\U0010ffff.") + + def test_ucs4(self): + writer = self.create_writer(0) + writer.write_ucs4("ascii", -1) + writer.write_char("-") + writer.write_ucs4("latin1=\xe9", -1) + writer.write_char("-") + writer.write_ucs4("euro=\u20ac", -1) + writer.write_char("-") + writer.write_ucs4("max=\U0010ffff", -1) + writer.write_char(".") + self.assertEqual(writer.finish(), + "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") @unittest.skipIf(ctypes is None, 'need ctypes') diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst index 3d1384c9f3252f..243f290fbd47e2 100644 --- a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst @@ -5,9 +5,12 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object: * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. +* :c:func:`PyUnicodeWriter_WriteUCS4`. +* :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. * :c:func:`PyUnicodeWriter_WriteSubstring`. * :c:func:`PyUnicodeWriter_Format`. +* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. Patch by Victor Stinner. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index c723e087baa308..b8ecf53f4f8b9c 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -360,6 +360,36 @@ writer_write_widechar(PyObject *self_raw, PyObject *args) } +static PyObject* +writer_write_ucs4(PyObject *self_raw, PyObject *args) +{ + WriterObject *self = (WriterObject *)self_raw; + if (writer_check(self) < 0) { + return NULL; + } + + PyObject *str; + Py_ssize_t size; + if (!PyArg_ParseTuple(args, "Un", &str, &size)) { + return NULL; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + size = Py_MIN(size, len); + + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str); + if (ucs4 == NULL) { + return NULL; + } + + int res = PyUnicodeWriter_WriteUCS4(self->writer, ucs4, size); + PyMem_Free(ucs4); + if (res < 0) { + return NULL; + } + Py_RETURN_NONE; +} + + static PyObject* writer_write_str(PyObject *self_raw, PyObject *args) { @@ -484,6 +514,7 @@ static PyMethodDef writer_methods[] = { {"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS}, {"write_utf8", _PyCFunction_CAST(writer_write_utf8), METH_VARARGS}, {"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS}, + {"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS}, {"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS}, {"write_repr", _PyCFunction_CAST(writer_write_repr), METH_VARARGS}, {"write_substring", _PyCFunction_CAST(writer_write_substring), METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 74a743812c9c78..b0a47032c84d91 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2289,6 +2289,50 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) return res; } + +int +PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, + Py_UCS4 *str, + Py_ssize_t size) +{ + _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; + + if (size < 0) { + size = 0; + for (; str[size] != '\0'; size++); + } + + if (size == 0) { + return 0; + } + + Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size); + + if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) { + return -1; + } + + int kind = writer->kind; + void *data = (Py_UCS1*)writer->data + writer->pos * kind; + if (kind == PyUnicode_1BYTE_KIND) { + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, + str, str + size, + data); + } + else if (kind == PyUnicode_2BYTE_KIND) { + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, + str, str + size, + data); + } + else { + memcpy(data, str, size * sizeof(Py_UCS4)); + } + writer->pos += size; + + return 0; +} + + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { From aefcbf898adee901bf3990310a039c6618237f8c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 14:33:02 +0200 Subject: [PATCH 2/6] size must be positive --- Doc/c-api/unicode.rst | 3 +-- Objects/unicodeobject.c | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 3b4b5162daf24e..246cf47df62e78 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1567,8 +1567,7 @@ object. Writer the UCS4 string *str* into *writer*. - *size* is a number of UCS4 characters. If *size* is equal to ``-1``, get the - string length (search the NUL character). + *size* is a number of UCS4 characters. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2e814b42f36fe8..c6dc9f09151796 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2298,8 +2298,9 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; if (size < 0) { - size = 0; - for (; str[size] != '\0'; size++); + PyErr_SetString(PyExc_TypeError, + "size must be positive"); + return NULL; } if (size == 0) { From e18a47c3a708d5b0723b7945eb496cc720c2aa3f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 14:36:05 +0200 Subject: [PATCH 3/6] Use PyUnicodeWriter_WriteUCS4() on Solaris --- Objects/unicodeobject.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c6dc9f09151796..6966120465a4ee 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2035,11 +2035,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, if (!converted) { return -1; } - PyObject *unicode = _PyUnicode_FromUCS4(converted, size); - PyMem_Free(converted); - int res = _PyUnicodeWriter_WriteStr(writer, unicode); - Py_DECREF(unicode); + int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size); + PyMem_Free(converted); return res; } #endif From 056c42e2e152fd1cdd8c1595f3e67b7bd92f9b16 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 14:58:17 +0200 Subject: [PATCH 4/6] Fix typo --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6966120465a4ee..f7db0b5d1f27e6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2298,7 +2298,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, if (size < 0) { PyErr_SetString(PyExc_TypeError, "size must be positive"); - return NULL; + return -1; } if (size == 0) { From 8c7691a9dd46bf12c7f72c24fea8ea4d12e709fd Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 15:13:29 +0200 Subject: [PATCH 5/6] Update tests --- Lib/test/test_capi/test_unicode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index f3c1e06fbdbe4f..91737a54820209 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1785,20 +1785,20 @@ def test_widechar(self): writer.write_widechar("-") writer.write_widechar("euro=\u20AC") writer.write_char("-") - writer.write_ucs4("max=\U0010ffff", -1) + writer.write_widechar("max=\U0010ffff") writer.write_char('.') self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC-max=\U0010ffff.") def test_ucs4(self): writer = self.create_writer(0) - writer.write_ucs4("ascii", -1) + writer.write_ucs4("ascii", 5) writer.write_char("-") - writer.write_ucs4("latin1=\xe9", -1) + writer.write_ucs4("latin1=\xe9", 8) writer.write_char("-") - writer.write_ucs4("euro=\u20ac", -1) + writer.write_ucs4("euro=\u20ac", 6) writer.write_char("-") - writer.write_ucs4("max=\U0010ffff", -1) + writer.write_ucs4("max=\U0010ffff", 5) writer.write_char(".") self.assertEqual(writer.finish(), "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") From 76e66dd101ef4eca5704c70f82aa6aaa18ab0eda Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 15:22:40 +0200 Subject: [PATCH 6/6] Add more tests --- Lib/test/test_capi/test_unicode.py | 20 +++++++++++++++++++- Objects/unicodeobject.c | 4 ++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 91737a54820209..8f9def2e650e56 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1792,7 +1792,7 @@ def test_widechar(self): def test_ucs4(self): writer = self.create_writer(0) - writer.write_ucs4("ascii", 5) + writer.write_ucs4("ascii IGNORED", 5) writer.write_char("-") writer.write_ucs4("latin1=\xe9", 8) writer.write_char("-") @@ -1803,6 +1803,24 @@ def test_ucs4(self): self.assertEqual(writer.finish(), "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") + # Test some special characters + writer = self.create_writer(0) + # Lone surrogate character + writer.write_ucs4("lone\uDC80", 5) + writer.write_char("-") + # Surrogate pair + writer.write_ucs4("pair\uDBFF\uDFFF", 5) + writer.write_char("-") + writer.write_ucs4("null[\0]", 7) + self.assertEqual(writer.finish(), + "lone\udc80-pair\udbff-null[\0]") + + # invalid size + writer = self.create_writer(0) + with self.assertRaises(ValueError): + writer.write_ucs4("text", -1) + + @unittest.skipIf(ctypes is None, 'need ctypes') class PyUnicodeWriterFormatTest(unittest.TestCase): diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f7db0b5d1f27e6..8b7e8dae6ee989 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2296,7 +2296,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; if (size < 0) { - PyErr_SetString(PyExc_TypeError, + PyErr_SetString(PyExc_ValueError, "size must be positive"); return -1; } @@ -13391,7 +13391,7 @@ PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) { if (length < 0) { - PyErr_SetString(PyExc_TypeError, + PyErr_SetString(PyExc_ValueError, "length must be positive"); return NULL; } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy