From 431abbaa89d737c22097fa1c967775e4b81f217c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 15:28:11 +0200 Subject: [PATCH 01/14] gh-76535: Make `PyUnicode_ToLowerFull` and friends public Make `PyUnicode_ToLowerFull`, `PyUnicode_ToUpperFull` and `PyUnicode_ToTitleFull` public and rename them to `PyUnicode_ToLower` etc. --- Doc/c-api/unicode.rst | 30 ++++++++++++++++++ Include/cpython/unicodeobject.h | 15 +++++++++ Include/internal/pycore_unicodeobject.h | 3 -- Objects/unicodectype.c | 42 +++++++++++++++++-------- Objects/unicodeobject.c | 10 +++--- 5 files changed, 79 insertions(+), 21 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 84fee05cb4ce20..90e6a382ea3078 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,36 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. 
+ Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 86c502730f478c..d52d86105e7d84 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,21 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(int) PyUnicode_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + +PyAPI_FUNC(int) PyUnicode_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + +PyAPI_FUNC(int) PyUnicode_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 3791b913c17546..cc1368fb63d4ae 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,9 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..9f10c02f67fd1a 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) { 
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -206,15 +206,21 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->lower; + + if (res != NULL) { + res[0] = ch + ctype->lower; + } return 1; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -222,15 +228,20 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->title; + if (res != NULL) { + res[0] = ch + ctype->title; + } return 1; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -238,11 +249,16 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->upper; + if (res != NULL) { + res[0] = ch + ctype->upper; + } return 1; } @@ -258,7 +274,7 @@ int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } - return _PyUnicode_ToLowerFull(ch, res); + return PyUnicode_ToLowerFull(ch, res); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 
5c2308a012142a..6c9c3ccfca4dea 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10046,7 +10046,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUnicode_ToLower(c, mapped); } static Py_ssize_t @@ -10057,7 +10057,7 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10084,7 +10084,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 n_res = lower_ucs4(kind, data, length, i, c, mapped); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped); } else { n_res = 1; @@ -10110,7 +10110,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10163,7 +10163,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped); else - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From d604fc8ed0edfcc06c20da42489cd130523eea1b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 16:52:21 +0200 Subject: [PATCH 02/14] Address feedback; add size parameter and do PyUnicode_ToFolded as well --- Doc/c-api/unicode.rst | 43 +++++++++++++++-------- 
Include/cpython/unicodeobject.h | 16 +++++++-- Include/internal/pycore_unicodeobject.h | 1 - Objects/unicodectype.c | 45 +++++++++++++++++++++---- Objects/unicodeobject.c | 22 ++++++------ 5 files changed, 91 insertions(+), 36 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 90e6a382ea3078..1e3d0c0b1ec1ff 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,36 +307,51 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + able to hold as many characters needed for *ch* to be lower cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + Convert *ch* to upper case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be upper cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. 
If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + Convert *ch* to title case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be title cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) + + Foldcase *ch*, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be foldcased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next + These APIs can be used to work with surrogates: .. 
c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index d52d86105e7d84..f9142af0057b78 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -735,19 +735,29 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( PyAPI_FUNC(int) PyUnicode_ToLower( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); PyAPI_FUNC(int) PyUnicode_ToUpper( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); PyAPI_FUNC(int) PyUnicode_ToTitle( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); +PyAPI_FUNC(int) PyUnicode_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ + ); + + // Helper array used by Py_UNICODE_ISSPACE(). 
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index cc1368fb63d4ae..c83a221bb6a3a1 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,7 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 9f10c02f67fd1a..2ef667c30a1690 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -208,6 +208,10 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } @@ -215,12 +219,16 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->lower; } return 1; } -int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } return n; } if (res != NULL) { + if (0 >= size) { + 
PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->title; } return 1; } -int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -251,18 +267,26 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } return n; } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->upper; } return 1; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -270,11 +294,18 @@ int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - return PyUnicode_ToLowerFull(ch, res); + return PyUnicode_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6c9c3ccfca4dea..4a3b77b727657a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10039,14 +10039,14 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i static int lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, int mapped_size) { /* Obscure special case. 
*/ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return PyUnicode_ToLower(c, mapped); + return PyUnicode_ToLower(c, mapped, mapped_size); } static Py_ssize_t @@ -10057,14 +10057,14 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10081,10 +10081,10 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, 3); } else { n_res = 1; @@ -10108,9 +10108,9 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); else - n_res = PyUnicode_ToUpper(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10139,7 +10139,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, 
mapped); + int j, n_res = PyUnicode_ToFolded(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10161,9 +10161,9 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m int n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); else - n_res = PyUnicode_ToTitle(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From fbbf8412011cb56092d2f90300f00b8096a725fe Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:56:46 +0000 Subject: [PATCH 03/14] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..65b5c45a33a895 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make :c:func:`PyUnicode_ToLower`, :c:func:`PyUnicode_ToUpper`, :c:func:`PyUnicode_ToTitle` and :c:func:`PyUnicode_ToFolded` public. 
From f17aa0c55e0014fc2f1e19aa041accdaf755a051 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 17:29:35 +0200 Subject: [PATCH 04/14] Address more feedback; assert return value and raise ValueError --- Doc/c-api/unicode.rst | 8 ++++---- Objects/unicodectype.c | 14 +++++++------- Objects/unicodeobject.c | 25 +++++++++++++++---------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 1e3d0c0b1ec1ff..879b76b770f8fa 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -313,7 +313,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be lower cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -324,7 +324,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be upper cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -335,7 +335,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be title cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. 
versionadded:: next @@ -346,7 +346,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be foldcased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 2ef667c30a1690..66a7d9d85e67cd 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -209,7 +209,7 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -220,7 +220,7 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->lower; @@ -239,7 +239,7 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -249,7 +249,7 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) } if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->title; @@ -268,7 +268,7 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { 
if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -278,7 +278,7 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) } if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->upper; @@ -297,7 +297,7 @@ int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4a3b77b727657a..05e9dbf7d3fa51 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10057,14 +10057,16 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped, 3); + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10081,15 +10083,16 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = 
lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped, 3); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10108,9 +10111,10 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToUpper(c, mapped, 3); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10139,7 +10143,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = PyUnicode_ToFolded(c, mapped, 3); + int j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10161,10 +10166,10 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m int n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToTitle(c, mapped, 3); - + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; From 4a704898102c0bc94d1490732cc6fa70c0f01515 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 18:32:04 +0200 
Subject: [PATCH 05/14] Add tests --- Lib/test/test_capi/test_unicode.py | 49 +++++++++++++ Modules/_testcapi/unicode.c | 110 +++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..2f9a2e0b8b5b51 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,55 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + from _testcapi import unicode_toupper + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + 
class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..01c4caef6e2a01 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,112 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_tolower only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 lower[3]; + int chars = PyUnicode_ToLower(c, lower, Py_ARRAY_LENGTH(lower)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, lower, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + +/* Test PyUnicode_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_toupper only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 upper[3]; + int chars = PyUnicode_ToUpper(c, upper, Py_ARRAY_LENGTH(upper)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, upper, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_totitle only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = 
PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 title[3]; + int chars = PyUnicode_ToTitle(c, title, Py_ARRAY_LENGTH(title)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, title, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_tofolded only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 folded[3]; + int chars = PyUnicode_ToFolded(c, folded, Py_ARRAY_LENGTH(folded)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, folded, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +683,10 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; From 61afd9a5f7a9906b5f38175833ada9a7b1993a0d Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 18:38:41 +0200 Subject: [PATCH 06/14] Document the maximum numbers of characters needed in the buffer --- Doc/c-api/unicode.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 879b76b770f8fa..9021b6142ed199 
100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -310,7 +310,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased, and + able to hold as many characters needed for *ch* to be lower cased + (e.g. a maximum of two character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -321,7 +322,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to upper case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be upper cased, and + able to hold as many characters needed for *ch* to be upper cased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -332,7 +334,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to title case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be title cased, and + able to hold as many characters needed for *ch* to be title cased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. 
@@ -343,7 +346,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) Foldcase *ch*, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be foldcased, and + able to hold as many characters needed for *ch* to be foldcased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. From 7885b17116451e3bc5e59e6fe281db0be573e195 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 13:56:02 +0200 Subject: [PATCH 07/14] Address feedback; test more characters and refactor _testcapi functions --- Lib/test/test_capi/test_unicode.py | 6 ++ Modules/_testcapi/unicode.c | 90 +++++++----------------------- 2 files changed, 27 insertions(+), 69 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 2f9a2e0b8b5b51..931ce47ed2911e 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1765,6 +1765,7 @@ def test_tolower(self): # Test unicode character self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1778,6 +1779,8 @@ def test_toupper(self): # Test unicode character self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1788,6 +1791,8 @@ def test_totitle(self): # Test unicode character self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") @support.cpython_only 
@unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1798,6 +1803,7 @@ def test_tofolded(self): # Test unicode character self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") # Test case-ignorable character self.assertEqual(unicode_tofolded("👍"), "👍") diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 01c4caef6e2a01..9959a7c613da48 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,56 +220,46 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } -/* Test PyUnicode_ToLower() */ static PyObject * -unicode_tolower(PyObject *self, PyObject *arg) +unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), const char *name) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_tolower only accepts 1-character strings"); + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_Format(PyExc_ValueError, "%s only accepts 1-character strings", name); return NULL; } - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); - Py_UCS4 lower[3]; - int chars = PyUnicode_ToLower(c, lower, Py_ARRAY_LENGTH(lower)); - assert(chars >= 1); + Py_UCS4 buf[3]; + int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + if (chars <= 0) { + PyErr_BadInternalCall(); + return NULL; + } PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); if (writer == NULL) { return NULL; } - if (PyUnicodeWriter_WriteUCS4(writer, lower, chars) < 0) { + if (PyUnicodeWriter_WriteUCS4(writer, buf, chars) < 0) { PyUnicodeWriter_Discard(writer); return NULL; } return PyUnicodeWriter_Finish(writer); } +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToLower, "unicode_tolower"); +} + /* Test PyUnicode_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - if 
(PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_toupper only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 upper[3]; - int chars = PyUnicode_ToUpper(c, upper, Py_ARRAY_LENGTH(upper)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, upper, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToUpper, "unicode_toupper"); } @@ -277,52 +267,14 @@ unicode_toupper(PyObject *self, PyObject *arg) static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_totitle only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 title[3]; - int chars = PyUnicode_ToTitle(c, title, Py_ARRAY_LENGTH(title)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, title, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToTitle, "unicode_totitle"); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_tofolded only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 folded[3]; - int chars = PyUnicode_ToFolded(c, folded, Py_ARRAY_LENGTH(folded)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, folded, chars) < 0) { - 
PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToFolded, "unicode_tofolded"); } From 6f9cb9572009d1cd7e55e229b15992490e146a19 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:14:03 +0200 Subject: [PATCH 08/14] Address more review comments --- Modules/_testcapi/unicode.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 9959a7c613da48..057bc3b7a6f1c9 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,10 +221,15 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), const char *name) +unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) { + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + if (PyUnicode_GET_LENGTH(str) != 1) { - PyErr_Format(PyExc_ValueError, "%s only accepts 1-character strings", name); + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); return NULL; } @@ -233,33 +238,24 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), Py_UCS4 buf[3]; int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); if (chars <= 0) { - PyErr_BadInternalCall(); return NULL; } - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, buf, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tolower(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToLower, "unicode_tolower"); + return 
unicode_case_operation(arg, PyUnicode_ToLower); } /* Test PyUnicode_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToUpper, "unicode_toupper"); + return unicode_case_operation(arg, PyUnicode_ToUpper); } @@ -267,14 +263,14 @@ unicode_toupper(PyObject *self, PyObject *arg) static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToTitle, "unicode_totitle"); + return unicode_case_operation(arg, PyUnicode_ToTitle); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToFolded, "unicode_tofolded"); + return unicode_case_operation(arg, PyUnicode_ToFolded); } From 6a974c44b5767aa95319e412de0091bae02fea18 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:19:05 +0200 Subject: [PATCH 09/14] Disallow passing NULL --- Doc/c-api/unicode.rst | 20 +++++------- Objects/unicodectype.c | 73 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 54 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 9021b6142ed199..11dd600c669243 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -312,9 +312,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased (e.g. a maximum of two character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. ..
versionadded:: next @@ -324,9 +323,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -334,9 +334,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -348,9 +345,8 @@ These APIs can be used for fast direct character conversions: Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. ..
versionadded:: next diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 66a7d9d85e67cd..ec0ae918b339ee 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -207,24 +207,20 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->lower >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->lower; + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->lower; return 1; } @@ -237,23 +233,20 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->title >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->title; + + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->title; return 1; } @@ -266,23 +259,20 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->upper >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - 
res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->upper; + + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->upper; return 1; } @@ -295,16 +285,15 @@ int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) int n = (ctype->lower >> 20) & 7; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } + return PyUnicode_ToLower(ch, res, size); } From ae033ff5d93131320b78b4efb33daaed4297fadf Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:29:10 +0200 Subject: [PATCH 10/14] Only return NULL when chars < 0 in C test functions Co-authored-by: Victor Stinner --- Modules/_testcapi/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 057bc3b7a6f1c9..cb1e2df5739211 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -237,7 +237,7 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) Py_UCS4 buf[3]; int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); - if (chars <= 0) { + if (chars < 0) { return NULL; } From e7ef477c36fdd2892c68243d257ec8584cd4ad7a Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 15:00:00 +0200 Subject: [PATCH 11/14] Use Py_ssize_t and don't check overflow in loop --- Doc/c-api/unicode.rst | 8 
++--- Include/cpython/unicodeobject.h | 16 ++++----- Modules/_testcapi/unicode.c | 4 +-- Objects/unicodectype.c | 62 ++++++++++++++++----------------- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 11dd600c669243..65966fb0180220 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,7 +307,7 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased @@ -318,7 +318,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased @@ -329,7 +329,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased @@ -340,7 +340,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. 
c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index f9142af0057b78..ea9f9b5921c3c2 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,28 +733,28 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(int) PyUnicode_ToLower( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToLower( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToUpper( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToUpper( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToTitle( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToTitle( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToFolded( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToFolded( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index cb1e2df5739211..21f6c0f62f11f5 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,7 +221,7 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t)) { if (!PyUnicode_Check(str)) { PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); @@ -236,7 +236,7 @@ unicode_case_operation(PyObject *str, int 
(*function)(Py_UCS4, Py_UCS4 *, int)) Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); Py_UCS4 buf[3]; - int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + Py_ssize_t chars = function(c, buf, Py_ARRAY_LENGTH(buf)); if (chars < 0) { return NULL; } diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index ec0ae918b339ee..da70f60b12c450 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,25 +198,25 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -224,25 +224,25 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { 
PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -250,25 +250,25 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -276,21 +276,21 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } From fff25db403f4ffc5fa56ef3b14250ee229611368 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 19:59:13 +0200 Subject: [PATCH 12/14] Use Py_ssize_t for return value variable in unicodeobject.c --- Objects/unicodeobject.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 05e9dbf7d3fa51..38b214df74a4df 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10037,9 +10037,9 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped, int mapped_size) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { @@ -10052,8 +10052,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); @@ -10081,7 +10080,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } @@ -10109,7 +10108,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else @@ -10143,7 +10142,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + Py_ssize_t j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, 
mapped[j]); @@ -10163,7 +10162,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); From f378ceaff9feb660d9edb23b05f9c1a393419ad6 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Sun, 27 Jul 2025 20:52:58 +0200 Subject: [PATCH 13/14] Address feedback; Rename to PyUCS4_*, define macro and test small buffer case --- Doc/c-api/unicode.rst | 8 ++--- Include/cpython/unicodeobject.h | 10 +++--- Lib/test/test_capi/test_unicode.py | 6 +++- ...5-07-01-14-56-41.gh-issue-76535.9cwObj.rst | 2 +- Modules/_testcapi/unicode.c | 35 +++++++++++++------ Objects/unicodectype.c | 10 +++--- Objects/unicodeobject.c | 12 +++---- 7 files changed, 51 insertions(+), 32 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 65966fb0180220..f6caa1f1f65c37 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,7 +307,7 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased @@ -318,7 +318,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. 
c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased @@ -329,7 +329,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased @@ -340,7 +340,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 725937af7afff7..662e3f5ab06dcf 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,25 +733,25 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToLower( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToUpper( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToTitle( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToFolded( +PyAPI_FUNC(Py_ssize_t) 
PyUCS4_ToFolded( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ @@ -792,6 +792,8 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +#define PyUCS4_CASE_CONVERSION_BUFFER_SIZE 3 + static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { return (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 931ce47ed2911e..3a5d1a0053f351 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1771,7 +1771,7 @@ def test_tolower(self): @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_toupper(self): import string - from _testcapi import unicode_toupper + from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small for i, c in enumerate(string.ascii_lowercase): with self.subTest(c): @@ -1782,6 +1782,10 @@ def test_toupper(self): self.assertEqual(unicode_toupper("ß"), "SS") self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + # Test unicode character with smaller buffer + with self.assertRaisesRegex(ValueError, "output buffer is too small"): + unicode_toupper_buffer_too_small("ß") + @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_totitle(self): diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst index 65b5c45a33a895..37d251b6e35d8f 100644 --- a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -1 +1 @@ -Make :c:func:`PyUnicode_ToLower`, :c:func:`PyUnicode_ToUpper`, :c:func:`PyUnicode_ToTitle` and :c:func:`PyUnicode_ToFolded` public. +Make :c:func:`PyUCS4_ToLower`, :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle` and :c:func:`PyUCS4_ToFolded` public. 
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 21f6c0f62f11f5..c3106f0fcb8543 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,7 +221,8 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t)) +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t), + Py_UCS4 *buf, Py_ssize_t size) { if (!PyUnicode_Check(str)) { PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); @@ -235,8 +236,7 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); - Py_UCS4 buf[3]; - Py_ssize_t chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + Py_ssize_t chars = function(c, buf, size); if (chars < 0) { return NULL; } @@ -244,33 +244,45 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); } -/* Test PyUnicode_ToLower() */ +/* Test PyUCS4_ToLower() */ static PyObject * unicode_tolower(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToLower); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } -/* Test PyUnicode_ToUpper() */ + +/* Test PyUCS4_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToUpper); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } +/* Test PyUCS4_ToUpper() with a small buffer */ +static PyObject * +unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf; + return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1); +} -/* Test PyUnicode_ToLower() */ +/* Test 
PyUCS4_ToTitle() */ static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToTitle); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } -/* Test PyUnicode_ToLower() */ +/* Test PyUCS4_ToFolded() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToFolded); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } @@ -633,6 +645,7 @@ static PyMethodDef TestMethods[] = { {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, {"unicode_tolower", unicode_tolower, METH_O}, {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_toupper_buffer_too_small", unicode_toupper_buffer_too_small, METH_O}, {"unicode_totitle", unicode_totitle, METH_O}, {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index da70f60b12c450..aacfc316e2b960 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -224,7 +224,7 @@ Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return 1; } -Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -250,7 +250,7 @@ Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return 1; } -Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, 
Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -276,7 +276,7 @@ Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return 1; } -Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -294,7 +294,7 @@ Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return n; } - return PyUnicode_ToLower(ch, res, size); + return PyUCS4_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 63c217d5c91060..c54d95cf9226fa 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10045,7 +10045,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return PyUnicode_ToLower(c, mapped, mapped_size); + return PyUCS4_ToLower(c, mapped, mapped_size); } static Py_ssize_t @@ -10055,7 +10055,7 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10084,7 +10084,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; @@ -10111,7 +10111,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped, 
Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10141,7 +10141,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - Py_ssize_t j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + Py_ssize_t j, n_res = PyUCS4_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10166,7 +10166,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From 1caaa854ef87008a02bbff3ea179f1c7e07a7b69 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 29 Jul 2025 10:19:03 +0200 Subject: [PATCH 14/14] Address feedback --- Doc/c-api/unicode.rst | 31 +++++++++++++++++++++++-------- Tools/unicode/makeunicodedata.py | 1 + 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index f6caa1f1f65c37..de6829eb92c5dc 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -310,46 +310,61 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (e.g. 
a maximum of two character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be lower cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next .. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be upper cased - (e.g. a maximum of three character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be upper cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next .. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be title cased - (e.g. a maximum of three character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be title cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next .. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be foldcased - (e.g. 
a maximum of three character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be foldcased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next +.. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE + + The minimum buffer size needed for any call to :c:func:`PyUCS4_ToLower`, + :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle`, or + :c:func:`PyUCS4_ToFolded`. That is, ``3`` for Unicode 16.0. + + .. versionadded:: next These APIs can be used to work with surrogates: diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index d4cca68c3e3e71..40ef4379419008 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -44,6 +44,7 @@ # * Doc/library/stdtypes.rst, and # * Doc/library/unicodedata.rst # * Doc/reference/lexical_analysis.rst (three occurrences) +# * Doc/c-api/unicode.rst (in case conversion APIs) UNIDATA_VERSION = "16.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy