diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 4ea20bde38c1db..246cf47df62e78 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1563,6 +1563,15 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. +.. c:function:: int PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *writer, Py_UCS4 *str, Py_ssize_t size) + + Write the UCS4 string *str* into *writer*. + + *size* is the number of UCS4 characters. + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. + .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b134ed31f6df40..9662044915b8ca 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -314,6 +314,7 @@ New Features * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. + * :c:func:`PyUnicodeWriter_WriteUCS4`. * :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. 
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 059bec8618c8d9..91799137101280 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -463,6 +463,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size); +PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4( + PyUnicodeWriter *writer, + Py_UCS4 *str, + Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( PyUnicodeWriter *writer, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 36106b0730dd26..8f9def2e650e56 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1784,8 +1784,42 @@ def test_widechar(self): writer.write_widechar("latin1=\xE9") writer.write_widechar("-") writer.write_widechar("euro=\u20AC") + writer.write_char("-") + writer.write_widechar("max=\U0010ffff") writer.write_char('.') - self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC.") + self.assertEqual(writer.finish(), + "latin1=\xE9-euro=\u20AC-max=\U0010ffff.") + + def test_ucs4(self): + writer = self.create_writer(0) + writer.write_ucs4("ascii IGNORED", 5) + writer.write_char("-") + writer.write_ucs4("latin1=\xe9", 8) + writer.write_char("-") + writer.write_ucs4("euro=\u20ac", 6) + writer.write_char("-") + writer.write_ucs4("max=\U0010ffff", 5) + writer.write_char(".") + self.assertEqual(writer.finish(), + "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") + + # Test some special characters + writer = self.create_writer(0) + # Lone surrogate character + writer.write_ucs4("lone\uDC80", 5) + writer.write_char("-") + # Surrogate pair + writer.write_ucs4("pair\uDBFF\uDFFF", 5) + writer.write_char("-") + writer.write_ucs4("null[\0]", 7) + self.assertEqual(writer.finish(), + "lone\udc80-pair\udbff-null[\0]") + + # invalid size + writer = self.create_writer(0) + with self.assertRaises(ValueError): + writer.write_ucs4("text", -1) + 
@unittest.skipIf(ctypes is None, 'need ctypes') diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst index 3d1384c9f3252f..243f290fbd47e2 100644 --- a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst @@ -5,9 +5,12 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object: * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. +* :c:func:`PyUnicodeWriter_WriteUCS4`. +* :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. * :c:func:`PyUnicodeWriter_WriteSubstring`. * :c:func:`PyUnicodeWriter_Format`. +* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. Patch by Victor Stinner. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index c723e087baa308..b8ecf53f4f8b9c 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -360,6 +360,36 @@ writer_write_widechar(PyObject *self_raw, PyObject *args) } +static PyObject* +writer_write_ucs4(PyObject *self_raw, PyObject *args) +{ + WriterObject *self = (WriterObject *)self_raw; + if (writer_check(self) < 0) { + return NULL; + } + + PyObject *str; + Py_ssize_t size; + if (!PyArg_ParseTuple(args, "Un", &str, &size)) { + return NULL; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + size = Py_MIN(size, len); + + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str); + if (ucs4 == NULL) { + return NULL; + } + + int res = PyUnicodeWriter_WriteUCS4(self->writer, ucs4, size); + PyMem_Free(ucs4); + if (res < 0) { + return NULL; + } + Py_RETURN_NONE; +} + + static PyObject* writer_write_str(PyObject *self_raw, PyObject *args) { @@ -484,6 +514,7 @@ static PyMethodDef writer_methods[] = { {"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS}, {"write_utf8", 
_PyCFunction_CAST(writer_write_utf8), METH_VARARGS}, {"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS}, + {"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS}, {"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS}, {"write_repr", _PyCFunction_CAST(writer_write_repr), METH_VARARGS}, {"write_substring", _PyCFunction_CAST(writer_write_substring), METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 279cdaa668e291..8b7e8dae6ee989 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2035,11 +2035,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, if (!converted) { return -1; } - PyObject *unicode = _PyUnicode_FromUCS4(converted, size); - PyMem_Free(converted); - int res = _PyUnicodeWriter_WriteStr(writer, unicode); - Py_DECREF(unicode); + int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size); + PyMem_Free(converted); return res; } #endif @@ -2289,6 +2287,51 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) return res; } + +int +PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, + Py_UCS4 *str, + Py_ssize_t size) +{ + _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; + + if (size < 0) { + PyErr_SetString(PyExc_ValueError, + "size must be positive"); + return -1; + } + + if (size == 0) { + return 0; + } + + Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size); + + if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) { + return -1; + } + + int kind = writer->kind; + void *data = (Py_UCS1*)writer->data + writer->pos * kind; + if (kind == PyUnicode_1BYTE_KIND) { + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, + str, str + size, + data); + } + else if (kind == PyUnicode_2BYTE_KIND) { + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, + str, str + size, + data); + } + else { + memcpy(data, str, size * sizeof(Py_UCS4)); + } + writer->pos += size; + + return 0; +} + + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, 
Py_ssize_t size) { @@ -13348,7 +13391,7 @@ PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) { if (length < 0) { - PyErr_SetString(PyExc_TypeError, + PyErr_SetString(PyExc_ValueError, "length must be positive"); return NULL; } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy