From 4c06f9ea74da809bd352deab0eb97aa757c2c01d Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Mon, 11 May 2020 20:54:17 +0300
Subject: [PATCH 1/2] bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows.

---
 Lib/test/test_unicode.py                      |  7 +++++++
 .../2020-05-11-20-53-52.bpo-40596.dwOH_X.rst  |  2 ++
 Objects/unicodeobject.c                       | 20 ++++++++++++++++---
 3 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 28398896467898..2ee4e64d635303 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -720,6 +720,13 @@ def test_isidentifier(self):
         self.assertFalse("©".isidentifier())
         self.assertFalse("0".isidentifier())
 
+    @support.cpython_only
+    def test_isidentifier_legacy(self):
+        import _testcapi
+        u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
+        self.assertTrue(u.isidentifier())
+        self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
+
     def test_isprintable(self):
         self.assertTrue("".isprintable())
         self.assertTrue(" ".isprintable())
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst
new file mode 100644
index 00000000000000..1252db4dc9848d
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst
@@ -0,0 +1,2 @@
+Fixed :meth:`str.isidentifier` for non-canonicalized strings containing
+non-BMP characters on Windows.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 276547ca48a5b2..058598e7a8555e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12356,20 +12356,34 @@ PyUnicode_IsIdentifier(PyObject *self)
         return len && i == len;
     }
     else {
-        Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
         if (len == 0) {
             /* an empty string is not a valid identifier */
             return 0;
         }
 
         const wchar_t *wstr = _PyUnicode_WSTR(self);
-        Py_UCS4 ch = wstr[0];
+        Py_UCS4 ch = wstr[i++];
+        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+            && i < len
+            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+        {
+            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+            i++;
+        }
         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
             return 0;
         }
 
-        for (i = 1; i < len; i++) {
+        for (; i < len; i++) {
             ch = wstr[i];
+            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+                && i < len
+                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+            {
+                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+                i++;
+            }
             if (!_PyUnicode_IsXidContinue(ch)) {
                 return 0;
             }

From 236d27b3399e7690eb8c979c2e4f672ab4705fa4 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Tue, 12 May 2020 14:40:18 +0300
Subject: [PATCH 2/2] Fix iteration.

---
 Objects/unicodeobject.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 058598e7a8555e..826298c23a924c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12364,6 +12364,7 @@ PyUnicode_IsIdentifier(PyObject *self)
 
         const wchar_t *wstr = _PyUnicode_WSTR(self);
         Py_UCS4 ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
             && i < len
             && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
@@ -12371,12 +12372,14 @@ PyUnicode_IsIdentifier(PyObject *self)
             ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
             i++;
         }
+#endif
         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
             return 0;
         }
 
-        for (; i < len; i++) {
-            ch = wstr[i];
+        while (i < len) {
+            ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
             if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
                 && i < len
                 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
@@ -12384,6 +12387,7 @@ PyUnicode_IsIdentifier(PyObject *self)
                 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
                 i++;
             }
+#endif
             if (!_PyUnicode_IsXidContinue(ch)) {
                 return 0;
             }

pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy