diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 84fee05cb4ce20..de6829eb92c5dc 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,66 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased, and + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to upper case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be upper cased, and + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to title case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be title cased, and + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Foldcase *ch*, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be foldcased, and + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + +.. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE + + The minimum buffer size needed for any call to :c:func:`PyUCS4_ToLower`, + :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle`, or + :c:func:`PyUCS4_ToFolded`. That is, ``3`` for Unicode 16.0. + +.. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 73e3bc44d6c9ca..662e3f5ab06dcf 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,31 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + + // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; @@ -767,6 +792,8 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +#define PyUCS4_CASE_CONVERSION_BUFFER_SIZE 3 + static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { return (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 3791b913c17546..c83a221bb6a3a1 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,10 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..3a5d1a0053f351 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,65 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + + # Test unicode character with smaller buffer + with self.assertRaisesRegex(ValueError, "output buffer is too small"): + unicode_toupper_buffer_too_small("ß") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..37d251b6e35d8f --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make :c:func:`PyUCS4_ToLower`, :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle` and :c:func:`PyUCS4_ToFolded` public. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..c3106f0fcb8543 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,72 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +static PyObject * +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t), + Py_UCS4 *buf, Py_ssize_t size) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); + + Py_ssize_t chars = function(c, buf, size); + if (chars < 0) { + return NULL; + } + + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); +} + +/* Test PyUCS4_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + + +/* Test PyUCS4_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + +/* Test PyUCS4_ToUpper() with a small buffer */ +static PyObject * +unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf; + return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1); +} + +/* Test PyUCS4_ToLower() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + +/* Test PyUCS4_ToLower() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +643,11 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_toupper_buffer_too_small", unicode_toupper_buffer_too_small, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..aacfc316e2b960 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,67 +198,103 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->lower; return 1; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->title; return 1; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->upper; return 1; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } - return _PyUnicode_ToLowerFull(ch, res); + + return PyUCS4_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8df7a48284dccd..c54d95cf9226fa 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10036,34 +10036,35 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUCS4_ToLower(c, mapped, mapped_size); } static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10078,17 +10079,18 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10105,11 +10107,12 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10138,7 +10141,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); + Py_ssize_t j, n_res = PyUCS4_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10157,13 +10161,13 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToTitleFull(c, mapped); - + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index d4cca68c3e3e71..40ef4379419008 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -44,6 +44,7 @@ # * Doc/library/stdtypes.rst, and # * Doc/library/unicodedata.rst # * Doc/reference/lexical_analysis.rst (three occurrences) +# * Doc/c-api-unicode.rst (in case conversion APIs) UNIDATA_VERSION = "16.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy