diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index bcdb60d86d8553..4a1362d943c809 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -1090,8 +1090,9 @@ unicodedata `_ release. * New function :func:`~unicodedata.is_normalized` can be used to verify a string - is in a specific normal form. (Contributed by Max Belanger and David Euresti in - :issue:`32285`). + is in a specific normal form, often much faster than by actually normalizing + the string. (Contributed by Max Belanger, David Euresti, and Greg Price in + :issue:`32285` and :issue:`37966`). unittest diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index a52b6de547fbc9..07d717688b0c59 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -220,6 +220,8 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + # For tests of unicodedata.is_normalized / self.db.is_normalized , + # see test_normalization.py . def test_east_asian_width(self): eaw = self.db.east_asian_width diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst new file mode 100644 index 00000000000000..6b9d69c5b3a9a4 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst @@ -0,0 +1,3 @@ +The implementation of :func:`~unicodedata.is_normalized` has been greatly +sped up on strings that aren't normalized, by implementing the full +normalization-quick-check algorithm from the Unicode standard. 
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index ae0d4e46f9a409..5e8ba602d66848 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,8 @@ #include "ucnhash.h" #include "structmember.h" +#include <stdbool.h> + _Py_IDENTIFIER(NFC); _Py_IDENTIFIER(NFD); _Py_IDENTIFIER(NFKC); @@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) return result; } -typedef enum {YES, NO, MAYBE} NormalMode; - -/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ -static NormalMode -is_normalized(PyObject *self, PyObject *input, int nfc, int k) +// This needs to match the logic in makeunicodedata.py +// which constructs the quickcheck data. +typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; + +/* Run the Unicode normalization "quickcheck" algorithm. + * + * Return YES or NO if quickcheck determines the input is certainly + * normalized or certainly not, and MAYBE if quickcheck is unable to + * tell. + * + * If `yes_only` is true, then return MAYBE as soon as we determine + * the answer is not YES. + * + * For background and details on the algorithm, see UAX #15: + * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + */ +static QuickcheckResult +is_normalized_quickcheck(PyObject *self, PyObject *input, + int nfc, int k, bool yes_only) { - Py_ssize_t i, len; - int kind; - void *data; - unsigned char prev_combining = 0, quickcheck_mask; - /* An older version of the database is requested, quickchecks must be disabled. */ if (self && UCD_Check(self)) return NO; - /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, - as described in http://unicode.org/reports/tr15/#Annex8. */ - quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); + Py_ssize_t i, len; + int kind; + void *data; + unsigned char prev_combining = 0; + + /* The two quickcheck bits at this shift have type QuickcheckResult. */ + int quickcheck_shift = (nfc ? 4 : 0) + (k ? 
2 : 0); + + QuickcheckResult result = YES; /* certainly normalized, unless we find something */ i = 0; kind = PyUnicode_KIND(input); @@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) while (i < len) { Py_UCS4 ch = PyUnicode_READ(kind, data, i++); const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); - unsigned char combining = record->combining; - unsigned char quickcheck = record->normalization_quick_check; - if (quickcheck & quickcheck_mask) - return MAYBE; /* this string might need normalization */ + unsigned char combining = record->combining; if (combining && prev_combining > combining) return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; + + unsigned char quickcheck_whole = record->normalization_quick_check; + if (yes_only) { + if (quickcheck_whole & (3 << quickcheck_shift)) + return MAYBE; + } else { + switch ((quickcheck_whole >> quickcheck_shift) & 3) { + case NO: + return NO; + case MAYBE: + result = MAYBE; /* this string might need normalization */ + } + } } - return YES; /* certainly normalized */ + return result; } /*[clinic input] @@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, PyObject *result; int nfc = 0; int k = 0; - NormalMode m; + QuickcheckResult m; PyObject *cmp; int match = 0; @@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, return NULL; } - m = is_normalized(self, input, nfc, k); + m = is_normalized_quickcheck(self, input, nfc, k, false); if (m == MAYBE) { cmp = (nfc ? 
nfc_nfkc : nfd_nfkd)(self, input, k); @@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - if (is_normalized(self, input, 1, 0) == YES) { + if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - if (is_normalized(self, input, 1, 1) == YES) { + if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - if (is_normalized(self, input, 0, 0) == YES) { + if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - if (is_normalized(self, input, 0, 1) == YES) { + if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) { Py_INCREF(input); return input; } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy