diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 2a2cb1b8c458e7..5ab9f1cab23ef8 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. +.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size) + + Compare a Unicode object with a char buffer which is interpreted as + being UTF-8 or ASCII encoded and return true (``1``) if they are equal, + or false (``0``) otherwise. + If the Unicode object contains surrogate characters or + the C string is not valid UTF-8, false (``0``) is returned. + + This function does not raise exceptions. + + .. versionadded:: 3.13 + + +.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) + + Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string* + length using :c:func:`!strlen`. + If the Unicode object contains null characters, false (``0``) is returned. + + .. versionadded:: 3.13 + + .. 
c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 5bccd5edf586f4..6ec9c907254b04 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -755,6 +755,8 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,, function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, +function,PyUnicode_EqualToUTF8,3.13,, +function,PyUnicode_EqualToUTF8AndSize,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 9a24c1fabf05d5..250f33e164fb30 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1024,6 +1024,12 @@ New Features functions on Python 3.11 and 3.12. (Contributed by Victor Stinner in :gh:`107073`.) +* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` + functions: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded + string and return true (``1``) if they are equal, or false (``0``) otherwise. + These functions do not raise exceptions. + (Contributed by Serhiy Storchaka in :gh:`110289`.) + * Add :c:func:`PyThreadState_GetUnchecked()` function: similar to :c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error if it is NULL. The caller is responsible to check if the result is NULL. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f00277787122aa..dee00715b3c51d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( const char *right /* ASCII-encoded string */ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +/* Compare a Unicode object with UTF-8 encoded C string. 
+ Return 1 if they are equal, or 0 otherwise. + This function does not raise exceptions. */ + +PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); +PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t); +#endif + /* Rich compare two strings and return one of the following: - NULL in case an exception was raised diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 622ee8993907fa..a73e669dda7ddc 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1297,6 +1297,118 @@ def test_comparewithasciistring(self): # CRASHES comparewithasciistring([], b'abc') # CRASHES comparewithasciistring(NULL, b'abc') + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_equaltoutf8(self): + # Test PyUnicode_EqualToUTF8(): the C string length is taken with strlen(), so bytes after the first null byte are ignored. + from _testcapi import unicode_equaltoutf8 as equaltoutf8 + from _testcapi import unicode_asutf8andsize as asutf8andsize + + strings = [ + 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', + '\U0001f600\U0001f601\U0001f602', + '\U0010ffff', + ] # code points that encode to 1, 2, 3 and 4 UTF-8 bytes + for s in strings: + # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + asutf8andsize(s, 0) + b = s.encode() + self.assertEqual(equaltoutf8(s, b), 1) # Use the UTF-8 cache. + s2 = b.decode() # New Unicode object without the UTF-8 cache. 
+ self.assertEqual(equaltoutf8(s2, b), 1) + self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) + self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8(s, b + b'\0'), 1) + self.assertEqual(equaltoutf8(s2, b + b'\0'), 1) + self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) + self.assertEqual(equaltoutf8(s + '\0', b), 0) + self.assertEqual(equaltoutf8(s2, b + b'x'), 0) + self.assertEqual(equaltoutf8(s2, b[:-1]), 0) + self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + + self.assertEqual(equaltoutf8('', b''), 1) + self.assertEqual(equaltoutf8('', b'\0'), 1) + + # embedded null chars/bytes: strlen() ends the C string at its first null byte + self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) + self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) + self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0) + + # Surrogate characters are always treated as not equal + self.assertEqual(equaltoutf8('\udcfe', + '\udcfe'.encode("utf8", "surrogateescape")), 0) + self.assertEqual(equaltoutf8('\udcfe', + '\udcfe'.encode("utf8", "surrogatepass")), 0) + self.assertEqual(equaltoutf8('\ud801', + '\ud801'.encode("utf8", "surrogatepass")), 0) + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_equaltoutf8andsize(self): + # Test PyUnicode_EqualToUTF8AndSize(): when the size argument is omitted, the _testcapi wrapper passes the full length of the bytes object. + from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize + from _testcapi import unicode_asutf8andsize as asutf8andsize + + strings = [ + 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', + '\U0001f600\U0001f601\U0001f602', + '\U0010ffff', + ] + for s in strings: + # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + asutf8andsize(s, 0) + b = s.encode() + self.assertEqual(equaltoutf8andsize(s, b), 1) # Use the UTF-8 cache. + s2 = b.decode() # New Unicode object without the UTF-8 cache. 
+ self.assertEqual(equaltoutf8andsize(s2, b), 1) + self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1) + self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0) + self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0) + self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1) + self.assertEqual(equaltoutf8andsize(s + '\0', b), 0) + self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0) + self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0) + self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0) + # Not null-terminated: only the first `size` bytes of the buffer are compared + self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1) + self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1) + self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1) + self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) + + self.assertEqual(equaltoutf8andsize('', b''), 1) + self.assertEqual(equaltoutf8andsize('', b'\0'), 0) + self.assertEqual(equaltoutf8andsize('', b'x', 0), 1) + + # embedded null chars/bytes: with an explicit size, null bytes are compared like any other byte + self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1) + self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1) + + # Surrogate characters are always treated as not equal + self.assertEqual(equaltoutf8andsize('\udcfe', + '\udcfe'.encode("utf8", "surrogateescape")), 0) + self.assertEqual(equaltoutf8andsize('\udcfe', + '\udcfe'.encode("utf8", "surrogatepass")), 0) + self.assertEqual(equaltoutf8andsize('\ud801', + '\ud801'.encode("utf8", "surrogatepass")), 0) + + def check_not_equal_encoding(text, encoding): + self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0) + self.assertNotEqual(text.encode(encoding), text.encode("utf8")) + + # Strings encoded with other encodings are not equal to the expected UTF-8 encoded string + check_not_equal_encoding('Stéphane', 'latin1') + check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters + check_not_equal_encoding('北京市', 'gbk') + + # CRASHES 
equaltoutf8andsize('abc', b'abc', -1) + # CRASHES equaltoutf8andsize(b'abc', b'abc') + # CRASHES equaltoutf8andsize([], b'abc') + # CRASHES equaltoutf8andsize(NULL, b'abc') + # CRASHES equaltoutf8andsize('abc', NULL) + @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_richcompare(self): diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 4691687ed9d391..e06f9cabf4366b 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -770,6 +770,8 @@ def test_windows_feature_macros(self): "PyUnicode_DecodeUnicodeEscape", "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", + "PyUnicode_EqualToUTF8", + "PyUnicode_EqualToUTF8AndSize", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst new file mode 100644 index 00000000000000..9028e35130d50c --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -0,0 +1 @@ +Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions. 
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 469fd27b622344..9d66b92eb8edf0 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2462,3 +2462,7 @@ added = '3.13' [function.Py_IsFinalizing] added = '3.13' +[function.PyUnicode_EqualToUTF8] + added = '3.13' +[function.PyUnicode_EqualToUTF8AndSize] + added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 232b2ad543fca0..d52d88a65d86fc 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args) return PyLong_FromLong(result); } +/* Test PyUnicode_EqualToUTF8(): takes (unicode, UTF-8 data or None) and returns the C result as a Python int */ +static PyObject * +unicode_equaltoutf8(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; /* filled in by the z# format unit but not used below */ + int result; + + if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) { + return NULL; + } + + NULLABLE(left); /* NOTE(review): macro defined outside this hunk -- presumably turns a placeholder argument into NULL; confirm */ + result = PyUnicode_EqualToUTF8(left, right); + assert(!PyErr_Occurred()); /* the function under test must not set an exception */ + return PyLong_FromLong(result); +} + +/* Test PyUnicode_EqualToUTF8AndSize(): takes (unicode, UTF-8 data or None[, size]) and returns the C result as a Python int */ +static PyObject * +unicode_equaltoutf8andsize(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + Py_ssize_t size = -100; /* sentinel meaning the optional size argument was omitted */ + int result; + + if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) { + return NULL; + } + + NULLABLE(left); + if (size == -100) { /* no explicit size: compare against the whole buffer */ + size = right_len; + } + result = PyUnicode_EqualToUTF8AndSize(left, right, size); + assert(!PyErr_Occurred()); /* the function under test must not set an exception */ + return PyLong_FromLong(result); +} + /* Test PyUnicode_RichCompare() */ static PyObject * unicode_richcompare(PyObject *self, PyObject *args) @@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = { {"unicode_replace", unicode_replace, METH_VARARGS}, {"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, + {"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS}, 
+ {"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 49981a1f881c21..33cbc987d43282 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } } +int +PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) +{ + return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str)); /* str must be null-terminated: strlen() stops at the first null byte */ +} + +int +PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size) +{ + assert(_PyUnicode_CHECK(unicode)); /* arguments are checked with asserts only; this function sets no exception */ + assert(str); + + if (PyUnicode_IS_ASCII(unicode)) { /* fast path: ASCII text is byte-identical to its UTF-8 encoding */ + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + return size == len && + memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; + } + if (PyUnicode_UTF8(unicode) != NULL) { /* fast path: a UTF-8 form is already stored on the object, compare the bytes directly */ + Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode); + return size == len && + memcmp(PyUnicode_UTF8(unicode), str, len) == 0; + } + + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) { /* the ASCII case returned above, so an equal buffer is longer than len bytes; no code point takes more than 4 bytes */ + return 0; + } + const unsigned char *s = (const unsigned char *)str; + const unsigned char *ends = s + (size_t)size; /* one past the last byte of the buffer */ + int kind = PyUnicode_KIND(unicode); + const void *data = PyUnicode_DATA(unicode); + /* Compare Unicode string and UTF-8 string: encode each code point on the fly and match it against the next bytes of the buffer */ + for (Py_ssize_t i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch < 0x80) { /* 1 byte: 0xxxxxxx */ + if (ends == s || s[0] != ch) { + return 0; + } + s += 1; + } + else if (ch < 0x800) { /* 2 bytes: 110xxxxx 10xxxxxx */ + if ((ends - s) < 2 || + s[0] != (0xc0 | (ch >> 6)) || + s[1] != (0x80 | (ch & 0x3f))) + { + return 0; + } + s += 2; + } + else if (ch < 0x10000) { /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */ + if (Py_UNICODE_IS_SURROGATE(ch) || /* a surrogate code point never compares equal */ + (ends - s) < 3 || + s[0] != (0xe0 | (ch >> 12)) || + s[1] != (0x80 | ((ch >> 6) & 0x3f)) || + s[2] != (0x80 | (ch & 
0x3f))) + { + return 0; + } + s += 3; + } + else { /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + assert(ch <= MAX_UNICODE); + if ((ends - s) < 4 || + s[0] != (0xf0 | (ch >> 18)) || + s[1] != (0x80 | ((ch >> 12) & 0x3f)) || + s[2] != (0x80 | ((ch >> 6) & 0x3f)) || + s[3] != (0x80 | (ch & 0x3f))) + { + return 0; + } + s += 4; + } + } + return s == ends; /* equal only if every byte of the buffer was consumed */ +} + int _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 785d6886f39f6d..7ee11746770442 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -689,6 +689,8 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful) EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) +EXPORT_FUNC(PyUnicode_EqualToUTF8) +EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format)
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: