From 2aa2e87c0b1f6e351a6533140e501b66c3459643 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 13 May 2025 15:58:27 +0200 Subject: [PATCH 1/7] gh-133968: Add PyUnicodeWriter_WriteASCII() function Replace most PyUnicodeWriter_WriteUTF8() calls with PyUnicodeWriter_WriteASCII(). --- Doc/c-api/unicode.rst | 17 +++++++++++++- Doc/whatsnew/3.15.rst | 6 ++++- Include/cpython/unicodeobject.h | 4 ++++ Lib/test/test_capi/test_unicode.py | 6 +++++ ...-05-13-16-06-46.gh-issue-133968.6alWst.rst | 4 ++++ Modules/_json.c | 10 ++++----- Modules/_ssl.c | 2 +- Modules/_testcapi/unicode.c | 22 +++++++++++++++++++ Objects/genericaliasobject.c | 6 ++--- Objects/typevarobject.c | 4 ++-- Objects/unicodeobject.c | 11 ++++++++++ Objects/unionobject.c | 8 +++---- Parser/asdl_c.py | 6 ++--- Python/Python-ast.c | 6 ++--- Python/context.c | 10 ++++----- Python/hamt.c | 6 ++--- 16 files changed, 97 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/C_API/2025-05-13-16-06-46.gh-issue-133968.6alWst.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index cdd90d05b70b36..83756a0b855b49 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1806,9 +1806,24 @@ object. See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. +.. c:function:: int PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, const char *str, Py_ssize_t size) + + Write the ASCII string *str* into *writer*. + + *size* is the string length in bytes. If *size* is equal to ``-1``, call + ``strlen(str)`` to get the string length. + + *str* must only contain ASCII characters. The behavior is undefined if + *str* contains non-ASCII characters. + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. + + .. versionadded:: 3.15 + .. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size) - Writer the wide string *str* into *writer*. + Write the wide string *str* into *writer*. *size* is a number of wide characters. If *size* is equal to ``-1``, call ``wcslen(str)`` to get the string length. diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 8cf5238e6cc49a..1faf42946e047f 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -161,7 +161,11 @@ C API changes New features ------------ -* TODO +* Add :c:func:`PyUnicodeWriter_WriteASCII` function to write an ASCII string + into a :c:type:`PyUnicodeWriter`. The function is faster than + :c:func:`PyUnicodeWriter_WriteUTF8`, but has an undefined behavior if the + input string contains non-ASCII characters. + (Contributed by Victor Stinner in :gh:`133968`.) Porting to Python 3.15 ---------------------- diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 136f5d5c5f8425..3d0414f5291fe4 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -478,6 +478,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( PyUnicodeWriter *writer, const char *str, Py_ssize_t size); +PyAPI_FUNC(int) PyUnicodeWriter_WriteASCII( + PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( PyUnicodeWriter *writer, const wchar_t *str, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 3408c10f426058..6f513e984b14de 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1776,6 +1776,12 @@ def test_utf8(self): self.assertEqual(writer.finish(), "ascii-latin1=\xE9-euro=\u20AC.") + def test_ascii(self): + writer = self.create_writer(0) + writer.write_ascii(b"Hello ", -1) + writer.write_ascii(b"Python! ", 6) + self.assertEqual(writer.finish(), "Hello Python") + def test_invalid_utf8(self): writer = self.create_writer(0) with self.assertRaises(UnicodeDecodeError): diff --git a/Misc/NEWS.d/next/C_API/2025-05-13-16-06-46.gh-issue-133968.6alWst.rst b/Misc/NEWS.d/next/C_API/2025-05-13-16-06-46.gh-issue-133968.6alWst.rst new file mode 100644 index 00000000000000..47d5a3bda39942 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-05-13-16-06-46.gh-issue-133968.6alWst.rst @@ -0,0 +1,4 @@ +Add :c:func:`PyUnicodeWriter_WriteASCII` function to write an ASCII string +into a :c:type:`PyUnicodeWriter`. The function is faster than +:c:func:`PyUnicodeWriter_WriteUTF8`, but has an undefined behavior if the +input string contains non-ASCII characters. Patch by Victor Stinner. diff --git a/Modules/_json.c b/Modules/_json.c index 89b0a41dd10acb..4aa6ae650651b3 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -1476,13 +1476,13 @@ encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer, int rv; if (obj == Py_None) { - return PyUnicodeWriter_WriteUTF8(writer, "null", 4); + return PyUnicodeWriter_WriteASCII(writer, "null", 4); } else if (obj == Py_True) { - return PyUnicodeWriter_WriteUTF8(writer, "true", 4); + return PyUnicodeWriter_WriteASCII(writer, "true", 4); } else if (obj == Py_False) { - return PyUnicodeWriter_WriteUTF8(writer, "false", 5); + return PyUnicodeWriter_WriteASCII(writer, "false", 5); } else if (PyUnicode_Check(obj)) { PyObject *encoded = encoder_encode_string(s, obj); @@ -1649,7 +1649,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer, if (PyDict_GET_SIZE(dct) == 0) { /* Fast path */ - return PyUnicodeWriter_WriteUTF8(writer, "{}", 2); + return PyUnicodeWriter_WriteASCII(writer, "{}", 2); } if (s->markers != Py_None) { @@ -1753,7 +1753,7 @@ encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer, return -1; if (PySequence_Fast_GET_SIZE(s_fast) == 0) { Py_DECREF(s_fast); - return PyUnicodeWriter_WriteUTF8(writer, "[]", 2); + return PyUnicodeWriter_WriteASCII(writer, "[]", 2); } if (s->markers != Py_None) { diff --git a/Modules/_ssl.c b/Modules/_ssl.c index 976da1340ecf1e..014e624f6c2f00 100644 --- a/Modules/_ssl.c +++ b/Modules/_ssl.c @@ -563,7 +563,7 @@ fill_and_set_sslerror(_sslmodulestate *state, goto fail; } } - if (PyUnicodeWriter_WriteUTF8(writer, "] ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "] ", 2) < 0) { goto fail; } } diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index b8ecf53f4f8b9c..e70f5c68bc3b69 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -332,6 +332,27 @@ writer_write_utf8(PyObject *self_raw, PyObject *args) } +static PyObject* +writer_write_ascii(PyObject *self_raw, PyObject *args) +{ + WriterObject *self = (WriterObject *)self_raw; + if (writer_check(self) < 0) { + return NULL; + } + + char *str; + Py_ssize_t size; + if (!PyArg_ParseTuple(args, "yn", &str, &size)) { + return NULL; + } + + if (PyUnicodeWriter_WriteASCII(self->writer, str, size) < 0) { + return NULL; + } + Py_RETURN_NONE; +} + + static PyObject* writer_write_widechar(PyObject *self_raw, PyObject *args) { @@ -513,6 +534,7 @@ writer_finish(PyObject *self_raw, PyObject *Py_UNUSED(args)) static PyMethodDef writer_methods[] = { {"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS}, {"write_utf8", _PyCFunction_CAST(writer_write_utf8), METH_VARARGS}, + {"write_ascii", _PyCFunction_CAST(writer_write_ascii), METH_VARARGS}, {"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS}, {"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS}, {"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS}, diff --git a/Objects/genericaliasobject.c b/Objects/genericaliasobject.c index ec3d01f00a3c3c..07b57f0c552ce9 100644 --- a/Objects/genericaliasobject.c +++ b/Objects/genericaliasobject.c @@ -65,7 +65,7 @@ ga_repr_items_list(PyUnicodeWriter *writer, PyObject *p) for (Py_ssize_t i = 0; i < len; i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { return -1; } } @@ -109,7 +109,7 @@ ga_repr(PyObject *self) } for (Py_ssize_t i = 0; i < len; i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { goto error; } } @@ -126,7 +126,7 @@ ga_repr(PyObject *self) } if (len == 0) { // for something like tuple[()] we should print a "()" - if (PyUnicodeWriter_WriteUTF8(writer, "()", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "()", 2) < 0) { goto error; } } diff --git a/Objects/typevarobject.c b/Objects/typevarobject.c index 6c199a52aa0ae6..cead6e69af5451 100644 --- a/Objects/typevarobject.c +++ b/Objects/typevarobject.c @@ -192,7 +192,7 @@ constevaluator_call(PyObject *self, PyObject *args, PyObject *kwargs) for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(value); i++) { PyObject *item = PyTuple_GET_ITEM(value, i); if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { PyUnicodeWriter_Discard(writer); return NULL; } @@ -273,7 +273,7 @@ _Py_typing_type_repr(PyUnicodeWriter *writer, PyObject *p) } if (p == (PyObject *)&_PyNone_Type) { - return PyUnicodeWriter_WriteUTF8(writer, "None", 4); + return PyUnicodeWriter_WriteASCII(writer, "None", 4); } if ((rc = PyObject_HasAttrWithError(p, &_Py_ID(__origin__))) > 0 && diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index aa94fb91e65fc3..529664d5b38aa2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -14083,6 +14083,17 @@ _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, return 0; } + +int +PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size) +{ + _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer; + return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size); +} + + int PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, const char *str, diff --git a/Objects/unionobject.c b/Objects/unionobject.c index 66435924b6c6c3..00ca5b9bf80341 100644 --- a/Objects/unionobject.c +++ b/Objects/unionobject.c @@ -290,7 +290,7 @@ union_repr(PyObject *self) } for (Py_ssize_t i = 0; i < len; i++) { - if (i > 0 && PyUnicodeWriter_WriteUTF8(writer, " | ", 3) < 0) { + if (i > 0 && PyUnicodeWriter_WriteASCII(writer, " | ", 3) < 0) { goto error; } PyObject *p = PyTuple_GET_ITEM(alias->args, i); @@ -300,12 +300,12 @@ union_repr(PyObject *self) } #if 0 - PyUnicodeWriter_WriteUTF8(writer, "|args=", 6); + PyUnicodeWriter_WriteASCII(writer, "|args=", 6); PyUnicodeWriter_WriteRepr(writer, alias->args); - PyUnicodeWriter_WriteUTF8(writer, "|h=", 3); + PyUnicodeWriter_WriteASCII(writer, "|h=", 3); PyUnicodeWriter_WriteRepr(writer, alias->hashable_args); if (alias->unhashable_args) { - PyUnicodeWriter_WriteUTF8(writer, "|u=", 3); + PyUnicodeWriter_WriteASCII(writer, "|u=", 3); PyUnicodeWriter_WriteRepr(writer, alias->unhashable_args); } #endif diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py index 22dcfe1b0d99bf..dba20226c3283a 100755 --- a/Parser/asdl_c.py +++ b/Parser/asdl_c.py @@ -1512,7 +1512,7 @@ def visitModule(self, mod): for (Py_ssize_t i = 0; i < Py_MIN(length, 2); i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { goto error; } } @@ -1536,7 +1536,7 @@ def visitModule(self, mod): } if (i == 0 && length > 2) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ...", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ...", 5) < 0) { goto error; } } @@ -1640,7 +1640,7 @@ def visitModule(self, mod): } if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { Py_DECREF(name); Py_DECREF(value_repr); goto error; diff --git a/Python/Python-ast.c b/Python/Python-ast.c index f7625ab1205bdc..660bc598a4862c 100644 --- a/Python/Python-ast.c +++ b/Python/Python-ast.c @@ -5796,7 +5796,7 @@ ast_repr_list(PyObject *list, int depth) for (Py_ssize_t i = 0; i < Py_MIN(length, 2); i++) { if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { goto error; } } @@ -5820,7 +5820,7 @@ ast_repr_list(PyObject *list, int depth) } if (i == 0 && length > 2) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ...", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ...", 5) < 0) { goto error; } } @@ -5924,7 +5924,7 @@ ast_repr_max_depth(AST_object *self, int depth) } if (i > 0) { - if (PyUnicodeWriter_WriteUTF8(writer, ", ", 2) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, ", ", 2) < 0) { Py_DECREF(name); Py_DECREF(value_repr); goto error; diff --git a/Python/context.c b/Python/context.c index dceaae9b42979d..9927cab915cae7 100644 --- a/Python/context.c +++ b/Python/context.c @@ -979,7 +979,7 @@ contextvar_tp_repr(PyObject *op) return NULL; } - if (PyUnicodeWriter_WriteUTF8(writer, "tok_used) { - if (PyUnicodeWriter_WriteUTF8(writer, " used", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, " used", 5) < 0) { goto error; } } - if (PyUnicodeWriter_WriteUTF8(writer, " var=", 5) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, " var=", 5) < 0) { goto error; } if (PyUnicodeWriter_WriteRepr(writer, (PyObject *)self->tok_var) < 0) { diff --git a/Python/hamt.c b/Python/hamt.c index f9bbf63961d8de..df8d6ec39d3f84 100644 --- a/Python/hamt.c +++ b/Python/hamt.c @@ -1176,7 +1176,7 @@ hamt_node_bitmap_dump(PyHamtNode_Bitmap *node, } if (key_or_null == NULL) { - if (PyUnicodeWriter_WriteUTF8(writer, "NULL:\n", -1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "NULL:\n", -1) < 0) { goto error; } @@ -1194,7 +1194,7 @@ hamt_node_bitmap_dump(PyHamtNode_Bitmap *node, } } - if (PyUnicodeWriter_WriteUTF8(writer, "\n", 1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "\n", 1) < 0) { goto error; } } @@ -1915,7 +1915,7 @@ hamt_node_array_dump(PyHamtNode_Array *node, goto error; } - if (PyUnicodeWriter_WriteUTF8(writer, "\n", 1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "\n", 1) < 0) { goto error; } } From fc08c32ff3baddd28fa65ea0aabea7bb49f42356 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 13 May 2025 17:02:55 +0100 Subject: [PATCH 2/7] Update Doc/c-api/unicode.rst Co-authored-by: Peter Bierma --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 83756a0b855b49..4d9a1782aabbc5 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1819,7 +1819,7 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. - .. versionadded:: 3.15 + .. versionadded:: next .. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size) From 14c22c3b8b3d853082f8297a930e7988f9ccc208 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 15 May 2025 21:41:38 +0200 Subject: [PATCH 3/7] Update Doc/whatsnew/3.15.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Doc/whatsnew/3.15.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 1faf42946e047f..f73b447079fffc 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -167,6 +167,7 @@ New features input string contains non-ASCII characters. (Contributed by Victor Stinner in :gh:`133968`.) + Porting to Python 3.15 ---------------------- From 33b3276a138bf7a280814e035b602fa2b185fc2b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 15 May 2025 21:43:01 +0200 Subject: [PATCH 4/7] Update Python/hamt.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Python/hamt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/hamt.c b/Python/hamt.c index df8d6ec39d3f84..906149cc6cdbdc 100644 --- a/Python/hamt.c +++ b/Python/hamt.c @@ -1176,7 +1176,7 @@ hamt_node_bitmap_dump(PyHamtNode_Bitmap *node, } if (key_or_null == NULL) { - if (PyUnicodeWriter_WriteASCII(writer, "NULL:\n", -1) < 0) { + if (PyUnicodeWriter_WriteASCII(writer, "NULL:\n", 6) < 0) { goto error; } From e7ca52fb6c2264963c933ac19f064b77585722e6 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 15 May 2025 21:45:57 +0200 Subject: [PATCH 5/7] Address Peter's review --- Objects/unicodeobject.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 529664d5b38aa2..5611f839627a2e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -14089,6 +14089,9 @@ PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, const char *str, Py_ssize_t size) { + assert(writer != NULL); + _Py_AssertHoldsTstate(); + _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer; return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size); } From 25e9444847b031520a37665d955244810114d3d7 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 15 May 2025 22:01:23 +0200 Subject: [PATCH 6/7] Test also empty string --- Lib/test/test_capi/test_unicode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6f513e984b14de..c8be4f3faa9483 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1779,6 +1779,7 @@ def test_utf8(self): def test_ascii(self): writer = self.create_writer(0) writer.write_ascii(b"Hello ", -1) + writer.write_ascii(b"", 0) writer.write_ascii(b"Python! ", 6) self.assertEqual(writer.finish(), "Hello Python") From b01a577ecebd1403162f2d2346f8dbd0db498755 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 29 May 2025 16:28:47 +0200 Subject: [PATCH 7/7] Please the linter: remove an unused import --- Lib/test/test_ctypes/test_incomplete.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_ctypes/test_incomplete.py b/Lib/test/test_ctypes/test_incomplete.py index 2f344611995b2c..3189fcd1bd1330 100644 --- a/Lib/test/test_ctypes/test_incomplete.py +++ b/Lib/test/test_ctypes/test_incomplete.py @@ -1,6 +1,5 @@ import ctypes import unittest -import warnings from ctypes import Structure, POINTER, pointer, c_char_p # String-based "incomplete pointers" were implemented in ctypes 0.6.3 (2003, when pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy