From d39945ec55aaa14d62c90fac3f7541034c5597be Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 3 Oct 2023 18:24:05 +0300 Subject: [PATCH 01/16] gh-110289: C API: Add PyUnicode_EqualToString() function --- Doc/c-api/unicode.rst | 10 +++ Doc/data/stable_abi.dat | 1 + Doc/whatsnew/3.13.rst | 7 +++ Include/unicodeobject.h | 8 +++ Lib/test/test_stable_abi_ctypes.py | 1 + ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 1 + Misc/stable_abi.toml | 2 + Objects/unicodeobject.c | 61 +++++++++++++++++++ PC/python3dll.c | 1 + 9 files changed, 92 insertions(+) create mode 100644 Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 2a2cb1b8c458e7..f552380124bb37 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,6 +1396,16 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. +.. c:function:: int PyUnicode_EqualToString(PyObject *unicode, const char *string) + + Compare a Unicode object with a UTF-8 encoded C string and return true + if they are equal and false otherwise. + + This function does not raise exceptions. + + .. versionadded:: 3.13 + + .. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index c189c78238f40f..abfc186cdc460d 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -755,6 +755,7 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,, function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, +function,PyUnicode_EqualToString,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 1ef04fa7ae6adc..7f05a0275f4664 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1001,6 +1001,13 @@ New Features :c:macro:`Py_TPFLAGS_MANAGED_DICT` flag. (Contributed by Victor Stinner in :gh:`107073`.) +* Add :c:func:`PyUnicode_EqualToString` function: compare Unicode object with + a :c:expr:`const char*` UTF-8 encoded bytes string and return true if they + are equal or false otherwise. + This function does not raise exceptions. + (Contributed by Serhiy Storchaka in :gh:`110289`.) + + Porting to Python 3.13 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f00277787122aa..a7fad22e606b28 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -957,6 +957,14 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( const char *right /* ASCII-encoded string */ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 +/* Compare a Unicode object with UTF-8 encoded C string and return 1 for equal + and 0 otherwise. + This function does not raise exceptions. */ + +PyAPI_FUNC(int) PyUnicode_EqualToString(PyObject *, const char *); +#endif + /* Rich compare two strings and return one of the following: - NULL in case an exception was raised diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 94f817f8e1d159..f224d67e6416d6 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -770,6 +770,7 @@ def test_windows_feature_macros(self): "PyUnicode_DecodeUnicodeEscape", "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", + "PyUnicode_EqualToString", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst new file mode 100644 index 00000000000000..ada5072071a476 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -0,0 +1 @@ +Add :c:func:`PyUnicode_EqualToString` function. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 8df3f85e61eec6..20f6ea560b4316 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2460,3 +2460,5 @@ added = '3.13' [function.PyMapping_HasKeyStringWithError] added = '3.13' +[function.PyUnicode_EqualToString] + added = '3.13' diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 49981a1f881c21..8c71990a011849 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10673,6 +10673,67 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } } +int +PyUnicode_EqualToString(PyObject *unicode, const char *str) +{ + assert(_PyUnicode_CHECK(unicode)); + assert(str); + if (PyUnicode_IS_ASCII(unicode)) { + size_t len = (size_t)PyUnicode_GET_LENGTH(unicode); + return strlen(str) == len && + memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; + } + if (PyUnicode_UTF8(unicode) != NULL) { + size_t len = (size_t)PyUnicode_UTF8_LENGTH(unicode); + return strlen(str) == len && + memcmp(PyUnicode_UTF8(unicode), str, len) == 0; + } + + Py_UCS4 ch; + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + int kind = PyUnicode_KIND(unicode); + const void *data = PyUnicode_DATA(unicode); + /* Compare Unicode string and UTF-8 string */ + for (Py_ssize_t i = 0; i < len; i++) { + ch = PyUnicode_READ(kind, data, i); + if (ch == 0x80) { + return 0; + } + else if (ch < 0x80) { + if (ch != (unsigned char)*str++) { + return 0; + } + } + else if (ch < 0x800) { + if ((0xc0 | (ch >> 6)) != (unsigned char)*str++ || + (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + { + return 0; + } + } + else if (ch < 0x10000) { + if (Py_UNICODE_IS_SURROGATE(ch) || + (0xe0 | (ch >> 12)) != (unsigned char)*str++ || + (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || + (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + { + return 0; + } + } + else { + assert(ch <= MAX_UNICODE); + if ((0xf0 | (ch >> 18)) != (unsigned char)*str++ || + (0x80 | ((ch >> 12) & 0x3f)) != (unsigned char)*str++ || + (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || + (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + { + return 0; + } + } + } + return *str == 0; +} + int _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 2c1cc8098ce856..5f629ccf99d28a 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -688,6 +688,7 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful) EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) +EXPORT_FUNC(PyUnicode_EqualToString) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) From 4793161fcb730e2d09794a2b7cf91460b2d48a87 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 3 Oct 2023 21:20:38 +0300 Subject: [PATCH 02/16] Add tests and address review comments. --- Doc/c-api/unicode.rst | 4 ++- Doc/data/stable_abi.dat | 2 +- Doc/whatsnew/3.13.rst | 2 +- Include/unicodeobject.h | 2 +- Lib/test/test_capi/test_unicode.py | 31 +++++++++++++++++++ Lib/test/test_stable_abi_ctypes.py | 2 +- ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 2 +- Misc/stable_abi.toml | 2 +- Modules/_testcapi/unicode.c | 19 ++++++++++++ Objects/unicodeobject.c | 4 +-- PC/python3dll.c | 2 +- 11 files changed, 62 insertions(+), 10 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index f552380124bb37..c8bd0d7f81c7e5 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,10 +1396,12 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. -.. c:function:: int PyUnicode_EqualToString(PyObject *unicode, const char *string) +.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) Compare a Unicode object with a UTF-8 encoded C string and return true if they are equal and false otherwise. + If the Unicode object contains null or surrogate characters or + the C string not encoded to UTF-8 return false. This function does not raise exceptions. diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index abfc186cdc460d..1407659d1ae576 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -755,7 +755,7 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,, function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, -function,PyUnicode_EqualToString,3.13,, +function,PyUnicode_EqualToUTF8,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 7f05a0275f4664..ccc29fd915fcf2 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1001,7 +1001,7 @@ New Features :c:macro:`Py_TPFLAGS_MANAGED_DICT` flag. (Contributed by Victor Stinner in :gh:`107073`.) -* Add :c:func:`PyUnicode_EqualToString` function: compare Unicode object with +* Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded bytes string and return true if they are equal or false otherwise. This function does not raise exceptions. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index a7fad22e606b28..e2787497c47bef 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -962,7 +962,7 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( and 0 otherwise. This function does not raise exceptions. */ -PyAPI_FUNC(int) PyUnicode_EqualToString(PyObject *, const char *); +PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); #endif /* Rich compare two strings and return one of the following: diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 622ee8993907fa..e6e3792c639aec 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1297,6 +1297,37 @@ def test_comparewithasciistring(self): # CRASHES comparewithasciistring([], b'abc') # CRASHES comparewithasciistring(NULL, b'abc') + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_equaltoutf8(self): + """Test PyUnicode_EqualToUTF8()""" + from _testcapi import unicode_equaltoutf8 as equaltoutf8 + + strings = [ + 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', + '\U0001f600\U0001f601\U0001f602' + ] + for s in strings: + b = s.encode() + self.assertEqual(equaltoutf8(s, b), 1) + self.assertEqual(equaltoutf8(b.decode(), b), 1) + self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) + self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) + self.assertEqual(equaltoutf8(s, b + b'x'), 0) + self.assertEqual(equaltoutf8(s, b[:-1]), 0) + self.assertEqual(equaltoutf8(s, b[:-1] + b'x'), 0) + + # surrogateescape + self.assertEqual(equaltoutf8('\udcfe', b'\xfe'), 0) + # surrogatepass + self.assertEqual(equaltoutf8('\udcfe', b'\xed\xb3\xbe'), 0) + + # CRASHES equaltoutf8(b'abc', b'abc') + # CRASHES equaltoutf8([], b'abc') + # CRASHES equaltoutf8(NULL, b'abc') + # CRASHES equaltoutf8('abc') # NULL + @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_richcompare(self): diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index f224d67e6416d6..d1ae4e382c9470 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -770,7 +770,7 @@ def test_windows_feature_macros(self): "PyUnicode_DecodeUnicodeEscape", "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", - "PyUnicode_EqualToString", + "PyUnicode_EqualToUTF8", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst index ada5072071a476..b1582bc1591590 100644 --- a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -1 +1 @@ -Add :c:func:`PyUnicode_EqualToString` function. +Add :c:func:`PyUnicode_EqualToUTF8` function. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 20f6ea560b4316..ae39fea3a66a24 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2460,5 +2460,5 @@ added = '3.13' [function.PyMapping_HasKeyStringWithError] added = '3.13' -[function.PyUnicode_EqualToString] +[function.PyUnicode_EqualToUTF8] added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 232b2ad543fca0..83fb8a7cfbcb87 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1429,6 +1429,24 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args) return PyLong_FromLong(result); } +/* Test PyUnicode_EqualToUTF8() */ +static PyObject * +unicode_equaltoutf8(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + int result; + + if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) + return NULL; + + NULLABLE(left); + result = PyUnicode_EqualToUTF8(left, right); + assert(!PyErr_Occurred()); + return PyLong_FromLong(result); +} + /* Test PyUnicode_RichCompare() */ static PyObject * unicode_richcompare(PyObject *self, PyObject *args) @@ -2044,6 +2062,7 @@ static PyMethodDef TestMethods[] = { {"unicode_replace", unicode_replace, METH_VARARGS}, {"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, + {"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8c71990a011849..4993b0c9c52b17 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10674,7 +10674,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } int -PyUnicode_EqualToString(PyObject *unicode, const char *str) +PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) { assert(_PyUnicode_CHECK(unicode)); assert(str); @@ -10696,7 +10696,7 @@ PyUnicode_EqualToString(PyObject *unicode, const char *str) /* Compare Unicode string and UTF-8 string */ for (Py_ssize_t i = 0; i < len; i++) { ch = PyUnicode_READ(kind, data, i); - if (ch == 0x80) { + if (ch == 0) { return 0; } else if (ch < 0x80) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 5f629ccf99d28a..0beb61f28e0ef8 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -688,7 +688,7 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful) EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) -EXPORT_FUNC(PyUnicode_EqualToString) +EXPORT_FUNC(PyUnicode_EqualToUTF8) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) From c55f9ac784a417bc615f2335c813c0e39437e0fd Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 10:53:32 +0300 Subject: [PATCH 03/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c8bd0d7f81c7e5..c9f11d93638333 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1401,7 +1401,7 @@ They all return ``NULL`` or ``-1`` if an exception occurs. Compare a Unicode object with a UTF-8 encoded C string and return true if they are equal and false otherwise. If the Unicode object contains null or surrogate characters or - the C string not encoded to UTF-8 return false. + the C string is not encoded to UTF-8 return 0. This function does not raise exceptions. From bdf2f1e27cdc42ec976a7a23b83f0aade13a56ad Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 11:30:14 +0300 Subject: [PATCH 04/16] Address some of review comments and test the UTF-8 cache. --- Doc/c-api/unicode.rst | 6 +++--- Lib/test/test_capi/test_unicode.py | 28 +++++++++++++++++---------- Objects/unicodeobject.c | 31 +++++++++++++++++------------- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c9f11d93638333..dec451464137fd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1398,10 +1398,10 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) - Compare a Unicode object with a UTF-8 encoded C string and return true - if they are equal and false otherwise. + Compare a Unicode object with a UTF-8 encoded C string and return true (``1``) + if they are equal and false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 return 0. + the C string is not encoded to UTF-8 return false. This function does not raise exceptions. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e6e3792c639aec..c3d7e3bc4c56ab 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1302,26 +1302,34 @@ def test_comparewithasciistring(self): def test_equaltoutf8(self): """Test PyUnicode_EqualToUTF8()""" from _testcapi import unicode_equaltoutf8 as equaltoutf8 + from _testcapi import unicode_asutf8andsize as asutf8andsize strings = [ 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', '\U0001f600\U0001f601\U0001f602' ] for s in strings: + # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + asutf8andsize(s, 0) b = s.encode() - self.assertEqual(equaltoutf8(s, b), 1) - self.assertEqual(equaltoutf8(b.decode(), b), 1) + self.assertEqual(equaltoutf8(s, b), 1) # Use the UTF-8 cache. + s2 = b.decode() # New Unicode object without the UTF-8 cache. + self.assertEqual(equaltoutf8(s2, b), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) - self.assertEqual(equaltoutf8(s, b + b'x'), 0) - self.assertEqual(equaltoutf8(s, b[:-1]), 0) - self.assertEqual(equaltoutf8(s, b[:-1] + b'x'), 0) - - # surrogateescape - self.assertEqual(equaltoutf8('\udcfe', b'\xfe'), 0) - # surrogatepass - self.assertEqual(equaltoutf8('\udcfe', b'\xed\xb3\xbe'), 0) + self.assertEqual(equaltoutf8(s2, b + b'x'), 0) + self.assertEqual(equaltoutf8(s2, b[:-1]), 0) + self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + + # Surrogate characters are always treated as not equal + self.assertEqual(equaltoutf8('\udcfe', + '\udcfe'.encode("utf8", "surrogateescape")), 0) + self.assertEqual(equaltoutf8('\udcfe', + '\udcfe'.encode("utf8", "surrogatepass")), 0) + self.assertEqual(equaltoutf8('\ud801', + '\ud801'.encode("utf8", "surrogatepass")), 0) # CRASHES equaltoutf8(b'abc', b'abc') # CRASHES equaltoutf8([], b'abc') diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4993b0c9c52b17..18e99a500bf3c6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10678,6 +10678,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) { assert(_PyUnicode_CHECK(unicode)); assert(str); + if (PyUnicode_IS_ASCII(unicode)) { size_t len = (size_t)PyUnicode_GET_LENGTH(unicode); return strlen(str) == len && @@ -10689,49 +10690,53 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) memcmp(PyUnicode_UTF8(unicode), str, len) == 0; } - Py_UCS4 ch; + const unsigned char *s = (const unsigned char *)str; Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); /* Compare Unicode string and UTF-8 string */ for (Py_ssize_t i = 0; i < len; i++) { - ch = PyUnicode_READ(kind, data, i); + Py_UCS4 ch = PyUnicode_READ(kind, data, i); if (ch == 0) { return 0; } else if (ch < 0x80) { - if (ch != (unsigned char)*str++) { + if (s[0] != ch) { return 0; } + s += 1; } else if (ch < 0x800) { - if ((0xc0 | (ch >> 6)) != (unsigned char)*str++ || - (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + if (s[0] != (0xc0 | (ch >> 6)) || + s[1] != (0x80 | (ch & 0x3f))) { return 0; } + s += 2; } else if (ch < 0x10000) { if (Py_UNICODE_IS_SURROGATE(ch) || - (0xe0 | (ch >> 12)) != (unsigned char)*str++ || - (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || - (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + s[0] != (0xe0 | (ch >> 12)) || + s[1] != (0x80 | ((ch >> 6) & 0x3f)) || + s[2] != (0x80 | (ch & 0x3f))) { return 0; } + s += 3; } else { assert(ch <= MAX_UNICODE); - if ((0xf0 | (ch >> 18)) != (unsigned char)*str++ || - (0x80 | ((ch >> 12) & 0x3f)) != (unsigned char)*str++ || - (0x80 | ((ch >> 6) & 0x3f)) != (unsigned char)*str++ || - (0x80 | (ch & 0x3f)) != (unsigned char)*str++) + if (s[0] != (0xf0 | (ch >> 18)) || + s[1] != (0x80 | ((ch >> 12) & 0x3f)) || + s[2] != (0x80 | ((ch >> 6) & 0x3f)) || + s[3] != (0x80 | (ch & 0x3f))) { return 0; } + s += 4; } } - return *str == 0; + return *s == 0; } int From 7223c14e3f9629d777fd27b477ad39d516472d80 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 16:15:04 +0300 Subject: [PATCH 05/16] Address review comments. --- Doc/c-api/unicode.rst | 6 +++--- Lib/test/test_capi/test_unicode.py | 9 +++++++++ Modules/_testcapi/unicode.c | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index dec451464137fd..36926b0681f7bc 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1398,10 +1398,10 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) - Compare a Unicode object with a UTF-8 encoded C string and return true (``1``) - if they are equal and false (``0``) otherwise. + Compare a Unicode object with a UTF-8 or ASCII encoded C string + and return true (``1``) if they are equal and false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 return false. + the C string is not encoded to UTF-8 or ASCII, return false. This function does not raise exceptions. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index c3d7e3bc4c56ab..f3fff6aa4dab9d 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1331,6 +1331,15 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8('\ud801', '\ud801'.encode("utf8", "surrogatepass")), 0) + def check_not_equal_encoding(text, encoding): + self.assertEqual(equaltoutf8(text, text.encode(encoding)), 0) + self.assertNotEqual(text.encode(encoding), text.encode("utf8")) + + # Strings encoded to other encodings are not equal to expected UTF8-encoding string + check_not_equal_encoding('Stéphane', 'latin1') + check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters + check_not_equal_encoding('北京市', 'gbk') + # CRASHES equaltoutf8(b'abc', b'abc') # CRASHES equaltoutf8([], b'abc') # CRASHES equaltoutf8(NULL, b'abc') diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 83fb8a7cfbcb87..094cae40049e6d 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1438,8 +1438,9 @@ unicode_equaltoutf8(PyObject *self, PyObject *args) Py_ssize_t right_len; int result; - if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) + if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) { return NULL; + } NULLABLE(left); result = PyUnicode_EqualToUTF8(left, right); From b2713274d26af3460d60f60b7189a3eeef823b9b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 17:53:32 +0300 Subject: [PATCH 06/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Doc/c-api/unicode.rst | 4 ++-- Doc/whatsnew/3.13.rst | 4 ++-- Lib/test/test_capi/test_unicode.py | 8 +++++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 36926b0681f7bc..ee72af3b5c9cb4 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1399,9 +1399,9 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) Compare a Unicode object with a UTF-8 or ASCII encoded C string - and return true (``1``) if they are equal and false (``0``) otherwise. + and return true (``1``) if they are equal, or false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 or ASCII, return false. + the C string is not encoded to UTF-8 or ASCII, return false (``0``) . This function does not raise exceptions. diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 067ccbd9259b9b..5181d346254b97 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1004,8 +1004,8 @@ New Features (Contributed by Victor Stinner in :gh:`107073`.) * Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with - a :c:expr:`const char*` UTF-8 encoded bytes string and return true if they - are equal or false otherwise. + a :c:expr:`const char*` UTF-8 encoded bytes string and return true (``1``) + if they are equal, or false (``0``) otherwise. This function does not raise exceptions. (Contributed by Serhiy Storchaka in :gh:`110289`.) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index f3fff6aa4dab9d..0dd9bebf33ad9d 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1306,7 +1306,8 @@ def test_equaltoutf8(self): strings = [ 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', - '\U0001f600\U0001f601\U0001f602' + '\U0001f600\U0001f601\U0001f602', + '\U0010ffff', ] for s in strings: # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 @@ -1323,6 +1324,11 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8(s2, b[:-1]), 0) self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + # embedded null chars/bytes + self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) + self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) + self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0) + # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8('\udcfe', '\udcfe'.encode("utf8", "surrogateescape")), 0) From 6f26ad6ccf4726073c912a7b2cb8e9bc469dfb38 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 17:57:32 +0300 Subject: [PATCH 07/16] Remove trailing spaces. --- Lib/test/test_capi/test_unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 0dd9bebf33ad9d..3e32b1b7150ab1 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1328,7 +1328,7 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0) - + # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8('\udcfe', '\udcfe'.encode("utf8", "surrogateescape")), 0) From dd124b87e00aa51f54e1da9adccb7b46c0aa16f5 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 18:41:44 +0300 Subject: [PATCH 08/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Include/unicodeobject.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index e2787497c47bef..1bce505e9c4d32 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -958,8 +958,8 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( ); #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 -/* Compare a Unicode object with UTF-8 encoded C string and return 1 for equal - and 0 otherwise. +/* Compare a Unicode object with UTF-8 encoded C string. + Return 1 if they are equal, or 0 otherwise. This function does not raise exceptions. */ PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); From 76b9177c4158e5b6f9afc898cf0cce8167c48ee4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 4 Oct 2023 18:52:36 +0300 Subject: [PATCH 09/16] Use "z#" instead of "|y#". --- Lib/test/test_capi/test_unicode.py | 2 +- Modules/_testcapi/unicode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 3e32b1b7150ab1..98cc69741baa0a 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1349,7 +1349,7 @@ def check_not_equal_encoding(text, encoding): # CRASHES equaltoutf8(b'abc', b'abc') # CRASHES equaltoutf8([], b'abc') # CRASHES equaltoutf8(NULL, b'abc') - # CRASHES equaltoutf8('abc') # NULL + # CRASHES equaltoutf8('abc', NULL) @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 094cae40049e6d..732d7f48ec49e4 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1438,7 +1438,7 @@ unicode_equaltoutf8(PyObject *self, PyObject *args) Py_ssize_t right_len; int result; - if (!PyArg_ParseTuple(args, "O|y#", &left, &right, &right_len)) { + if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) { return NULL; } From ee5781d223e3bf55dc0a3de59e3eb81a9726b40a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 5 Oct 2023 22:31:17 +0300 Subject: [PATCH 10/16] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- Doc/c-api/unicode.rst | 2 +- Doc/whatsnew/3.13.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index ee72af3b5c9cb4..57e55cdbfe496a 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1401,7 +1401,7 @@ They all return ``NULL`` or ``-1`` if an exception occurs. Compare a Unicode object with a UTF-8 or ASCII encoded C string and return true (``1``) if they are equal, or false (``0``) otherwise. If the Unicode object contains null or surrogate characters or - the C string is not encoded to UTF-8 or ASCII, return false (``0``) . + the C string is not valid UTF-8, false (``0``) is returned. This function does not raise exceptions. diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 5181d346254b97..ff92bd17065922 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1004,7 +1004,7 @@ New Features (Contributed by Victor Stinner in :gh:`107073`.) * Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with - a :c:expr:`const char*` UTF-8 encoded bytes string and return true (``1``) + a :c:expr:`const char*` UTF-8 encoded string and return true (``1``) if they are equal, or false (``0``) otherwise. This function does not raise exceptions. (Contributed by Serhiy Storchaka in :gh:`110289`.) From 1a4eb7bb149cd1b3dcdde7e822119f80ece85cd6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 6 Oct 2023 09:44:53 +0300 Subject: [PATCH 11/16] Add PyUnicode_EqualToUTF8AndSize(). --- Doc/c-api/unicode.rst | 18 ++++-- Doc/data/stable_abi.dat | 1 + Doc/whatsnew/3.13.rst | 8 +-- Include/unicodeobject.h | 1 + Lib/test/test_capi/test_unicode.py | 58 +++++++++++++++++-- Lib/test/test_stable_abi_ctypes.py | 1 + ...-10-03-19-01-20.gh-issue-110289.YBIHEz.rst | 2 +- Misc/stable_abi.toml | 2 + Modules/_testcapi/unicode.c | 24 ++++++++ Objects/unicodeobject.c | 36 +++++++----- PC/python3dll.c | 1 + 11 files changed, 125 insertions(+), 27 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 57e55cdbfe496a..00f4bac65a252a 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1396,11 +1396,12 @@ They all return ``NULL`` or ``-1`` if an exception occurs. :c:func:`PyErr_Occurred` to check for errors. -.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) +.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size) - Compare a Unicode object with a UTF-8 or ASCII encoded C string - and return true (``1``) if they are equal, or false (``0``) otherwise. - If the Unicode object contains null or surrogate characters or + Compare a Unicode object with a char buffer which is interpreted as + being UTF-8 or ASCII encoded and return true (``1``) if they are equal, + or false (``0``) otherwise. + If the Unicode object contains surrogate characters or the C string is not valid UTF-8, false (``0``) is returned. This function does not raise exceptions. @@ -1408,6 +1409,15 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. versionadded:: 3.13 +.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) + + Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute the string + length using :c:func:`!strlen`. + If the Unicode object contains null characters, false (``0``) is returned. + + .. versionadded:: 3.13 + + .. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 1407659d1ae576..bfb1f97b554fc6 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -756,6 +756,7 @@ function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, function,PyUnicode_EqualToUTF8,3.13,, +function,PyUnicode_EqualToUTF8AndSize,3.13,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index ff92bd17065922..2577606373e4ba 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1003,10 +1003,10 @@ New Features functions on Python 3.11 and 3.12. (Contributed by Victor Stinner in :gh:`107073`.) -* Add :c:func:`PyUnicode_EqualToUTF8` function: compare Unicode object with - a :c:expr:`const char*` UTF-8 encoded string and return true (``1``) - if they are equal, or false (``0``) otherwise. - This function does not raise exceptions. +* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` + functions: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded + string and return true (``1``) if they are equal, or false (``0``) otherwise. + These functions do not raise exceptions. (Contributed by Serhiy Storchaka in :gh:`110289`.) * Add :c:func:`PyThreadState_GetUnchecked()` function: similar to diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 1bce505e9c4d32..dee00715b3c51d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -963,6 +963,7 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( This function does not raise exceptions. */ PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); +PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t); #endif /* Rich compare two strings and return one of the following: diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 98cc69741baa0a..0bbab9bfc0ec01 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1320,6 +1320,7 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) + self.assertEqual(equaltoutf8(s + '\0', b), 0) self.assertEqual(equaltoutf8(s2, b + b'x'), 0) self.assertEqual(equaltoutf8(s2, b[:-1]), 0) self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) @@ -1337,8 +1338,54 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8('\ud801', '\ud801'.encode("utf8", "surrogatepass")), 0) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_equaltoutf8andsize(self): + """Test PyUnicode_EqualToUTF8AndSize()""" + from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize + from _testcapi import unicode_asutf8andsize as asutf8andsize + + strings = [ + 'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16', + '\U0001f600\U0001f601\U0001f602', + '\U0010ffff', + ] + for s in strings: + # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + asutf8andsize(s, 0) + b = s.encode() + self.assertEqual(equaltoutf8andsize(s, b), 1) # Use the UTF-8 cache. + s2 = b.decode() # New Unicode object without the UTF-8 cache. + self.assertEqual(equaltoutf8andsize(s2, b), 1) + self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1) + self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1) + self.assertEqual(equaltoutf8andsize(s + '\0', b), 0) + self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0) + self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0) + self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0) + # Not null-terminated, + self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1) + self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1) + self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1) + self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) + + # embedded null chars/bytes + self.assertEqual(equaltoutf8andsize('abc', b'abc\0def\0'), 0) + self.assertEqual(equaltoutf8andsize('a\0bc', b'abc'), 0) + self.assertEqual(equaltoutf8andsize('abc', b'a\0bc'), 0) + + # Surrogate characters are always treated as not equal + self.assertEqual(equaltoutf8andsize('\udcfe', + '\udcfe'.encode("utf8", "surrogateescape")), 0) + self.assertEqual(equaltoutf8andsize('\udcfe', + '\udcfe'.encode("utf8", "surrogatepass")), 0) + self.assertEqual(equaltoutf8andsize('\ud801', + '\ud801'.encode("utf8", "surrogatepass")), 0) + def check_not_equal_encoding(text, encoding): - self.assertEqual(equaltoutf8(text, text.encode(encoding)), 0) + self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0) self.assertNotEqual(text.encode(encoding), text.encode("utf8")) # Strings encoded to other encodings are not equal to expected UTF8-encoding string @@ -1346,10 +1393,11 @@ def check_not_equal_encoding(text, encoding): check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters check_not_equal_encoding('北京市', 'gbk') - # CRASHES equaltoutf8(b'abc', b'abc') - # CRASHES equaltoutf8([], b'abc') - # CRASHES equaltoutf8(NULL, b'abc') - # CRASHES equaltoutf8('abc', NULL) + # CRASHES equaltoutf8andsize('abc', b'abc', -1) + # CRASHES equaltoutf8andsize(b'abc', b'abc') + # CRASHES equaltoutf8andsize([], b'abc') + # CRASHES equaltoutf8andsize(NULL, b'abc') + # CRASHES equaltoutf8andsize('abc', NULL) @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index d1ae4e382c9470..2a22f6edbf4761 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -771,6 +771,7 @@ def test_windows_feature_macros(self): "PyUnicode_EncodeFSDefault", "PyUnicode_EncodeLocale", "PyUnicode_EqualToUTF8", + "PyUnicode_EqualToUTF8AndSize", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", diff --git a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst index b1582bc1591590..9028e35130d50c 100644 --- a/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst +++ b/Misc/NEWS.d/next/C API/2023-10-03-19-01-20.gh-issue-110289.YBIHEz.rst @@ -1 +1 @@ -Add :c:func:`PyUnicode_EqualToUTF8` function. +Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index ae39fea3a66a24..4503a9c45d4ac0 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2462,3 +2462,5 @@ added = '3.13' [function.PyUnicode_EqualToUTF8] added = '3.13' +[function.PyUnicode_EqualToUTF8AndSize] + added = '3.13' diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 732d7f48ec49e4..d52d88a65d86fc 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1448,6 +1448,29 @@ unicode_equaltoutf8(PyObject *self, PyObject *args) return PyLong_FromLong(result); } +/* Test PyUnicode_EqualToUTF8AndSize() */ +static PyObject * +unicode_equaltoutf8andsize(PyObject *self, PyObject *args) +{ + PyObject *left; + const char *right = NULL; + Py_ssize_t right_len; + Py_ssize_t size = -100; + int result; + + if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) { + return NULL; + } + + NULLABLE(left); + if (size == -100) { + size = right_len; + } + result = PyUnicode_EqualToUTF8AndSize(left, right, size); + assert(!PyErr_Occurred()); + return PyLong_FromLong(result); +} + /* Test PyUnicode_RichCompare() */ static PyObject * unicode_richcompare(PyObject *self, PyObject *args) @@ -2064,6 +2087,7 @@ static PyMethodDef TestMethods[] = { {"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, {"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS}, + {"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18e99a500bf3c6..63b65f35f2936a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10675,39 +10675,47 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) int PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) +{ + return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str)); +} + +int +PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size) { assert(_PyUnicode_CHECK(unicode)); assert(str); if (PyUnicode_IS_ASCII(unicode)) { - size_t len = (size_t)PyUnicode_GET_LENGTH(unicode); - return strlen(str) == len && + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + return size == len && memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; } if (PyUnicode_UTF8(unicode) != NULL) { - size_t len = (size_t)PyUnicode_UTF8_LENGTH(unicode); - return strlen(str) == len && + Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode); + return size == len && memcmp(PyUnicode_UTF8(unicode), str, len) == 0; } - const unsigned char *s = (const unsigned char *)str; Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) { + return 0; + } + const unsigned char *s = (const unsigned char *)str; + const unsigned char *ends = s + (size_t)size; int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); /* Compare Unicode string and UTF-8 string */ for (Py_ssize_t i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - if (ch == 0) { - return 0; - } - else if (ch < 0x80) { - if (s[0] != ch) { + if (ch < 0x80) { + if (ends == s || s[0] != ch) { return 0; } s += 1; } else if (ch < 0x800) { - if (s[0] != (0xc0 | (ch >> 6)) || + if (ends - s < 2 || + s[0] != (0xc0 | (ch >> 6)) || s[1] != (0x80 | (ch & 0x3f))) { return 0; @@ -10716,6 +10724,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) } else if (ch < 0x10000) { if (Py_UNICODE_IS_SURROGATE(ch) || + ends - s < 3 || s[0] != (0xe0 | (ch >> 12)) || s[1] != (0x80 | ((ch >> 6) & 0x3f)) || s[2] != (0x80 | (ch & 0x3f))) @@ -10726,7 +10735,8 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) } else { assert(ch <= MAX_UNICODE); - if (s[0] != (0xf0 | (ch >> 18)) || + if (ends - s < 4 || + s[0] != (0xf0 | (ch >> 18)) || s[1] != (0x80 | ((ch >> 12) & 0x3f)) || s[2] != (0x80 | ((ch >> 6) & 0x3f)) || s[3] != (0x80 | (ch & 0x3f))) @@ -10736,7 +10746,7 @@ PyUnicode_EqualToUTF8(PyObject *unicode, const char *str) s += 4; } } - return *s == 0; + return s == ends; } int diff --git a/PC/python3dll.c b/PC/python3dll.c index 0beb61f28e0ef8..1fb4c810cf1cfb 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -689,6 +689,7 @@ EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EqualToUTF8) +EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) From b1243770a7bb5d8234708d8070320fd256a8b5df Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 7 Oct 2023 15:43:47 +0300 Subject: [PATCH 12/16] Apply suggestions from code review Co-authored-by: Victor Stinner --- Doc/c-api/unicode.rst | 2 +- Lib/test/test_capi/test_unicode.py | 5 ++--- Objects/unicodeobject.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 00f4bac65a252a..5ab9f1cab23ef8 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1411,7 +1411,7 @@ They all return ``NULL`` or ``-1`` if an exception occurs. .. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string) - Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute the string + Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string* length using :c:func:`!strlen`. If the Unicode object contains null characters, false (``0``) is returned. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 0bbab9bfc0ec01..e10c4ff3a94f7c 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1372,9 +1372,8 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) # embedded null chars/bytes - self.assertEqual(equaltoutf8andsize('abc', b'abc\0def\0'), 0) - self.assertEqual(equaltoutf8andsize('a\0bc', b'abc'), 0) - self.assertEqual(equaltoutf8andsize('abc', b'a\0bc'), 0) + self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def', 7), 1) + self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0', 8), 1) # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8andsize('\udcfe', diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 63b65f35f2936a..e234277c37513a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10714,7 +10714,7 @@ PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size s += 1; } else if (ch < 0x800) { - if (ends - s < 2 || + if ((ends - s) < 2 || s[0] != (0xc0 | (ch >> 6)) || s[1] != (0x80 | (ch & 0x3f))) { From 029f1a06efd2e41139d2ce9842e6a5511163c74a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 7 Oct 2023 15:48:57 +0300 Subject: [PATCH 13/16] Add more parentheses. --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e234277c37513a..33cbc987d43282 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10724,7 +10724,7 @@ PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size } else if (ch < 0x10000) { if (Py_UNICODE_IS_SURROGATE(ch) || - ends - s < 3 || + (ends - s) < 3 || s[0] != (0xe0 | (ch >> 12)) || s[1] != (0x80 | ((ch >> 6) & 0x3f)) || s[2] != (0x80 | (ch & 0x3f))) @@ -10735,7 +10735,7 @@ PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size } else { assert(ch <= MAX_UNICODE); - if (ends - s < 4 || + if ((ends - s) < 4 || s[0] != (0xf0 | (ch >> 18)) || s[1] != (0x80 | ((ch >> 12) & 0x3f)) || s[2] != (0x80 | ((ch >> 6) & 0x3f)) || From be2ffe844a5f274168f2f7e554f8e1f745e83cb9 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 7 Oct 2023 15:50:58 +0300 Subject: [PATCH 14/16] Remove redundant arguments. --- Lib/test/test_capi/test_unicode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e10c4ff3a94f7c..28ab4ddb46009b 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1372,8 +1372,8 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) # embedded null chars/bytes - self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def', 7), 1) - self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0', 8), 1) + self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1) + self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1) # Surrogate characters are always treated as not equal self.assertEqual(equaltoutf8andsize('\udcfe', From 78de49d5f40466abfa88640eb251d956e6ebb855 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 10 Oct 2023 23:34:19 +0300 Subject: [PATCH 15/16] Turn docstrings into comments. --- Lib/test/test_capi/test_unicode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 28ab4ddb46009b..dd0dc950ca0b90 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1300,7 +1300,7 @@ def test_comparewithasciistring(self): @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_equaltoutf8(self): - """Test PyUnicode_EqualToUTF8()""" + # Test PyUnicode_EqualToUTF8() from _testcapi import unicode_equaltoutf8 as equaltoutf8 from _testcapi import unicode_asutf8andsize as asutf8andsize @@ -1341,7 +1341,7 @@ def test_equaltoutf8(self): @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_equaltoutf8andsize(self): - """Test PyUnicode_EqualToUTF8AndSize()""" + # Test PyUnicode_EqualToUTF8AndSize() from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize from _testcapi import unicode_asutf8andsize as asutf8andsize From 19ad12633077bc6122a2a1340d6843f8da241574 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 11 Oct 2023 13:05:25 +0300 Subject: [PATCH 16/16] Add tests for empty strings. --- Lib/test/test_capi/test_unicode.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index dd0dc950ca0b90..a73e669dda7ddc 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1319,12 +1319,17 @@ def test_equaltoutf8(self): self.assertEqual(equaltoutf8(s2, b), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8(s, b + b'\0'), 1) + self.assertEqual(equaltoutf8(s2, b + b'\0'), 1) self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0) self.assertEqual(equaltoutf8(s + '\0', b), 0) self.assertEqual(equaltoutf8(s2, b + b'x'), 0) self.assertEqual(equaltoutf8(s2, b[:-1]), 0) self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0) + self.assertEqual(equaltoutf8('', b''), 1) + self.assertEqual(equaltoutf8('', b'\0'), 1) + # embedded null chars/bytes self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1) self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0) @@ -1360,6 +1365,8 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s2, b), 1) self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1) self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0) + self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0) + self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0) self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1) self.assertEqual(equaltoutf8andsize(s + '\0', b), 0) self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0) @@ -1371,6 +1378,10 @@ def test_equaltoutf8andsize(self): self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1) self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0) + self.assertEqual(equaltoutf8andsize('', b''), 1) + self.assertEqual(equaltoutf8andsize('', b'\0'), 0) + self.assertEqual(equaltoutf8andsize('', b'x', 0), 1) + # embedded null chars/bytes self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1) self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy