From 92873d68937cb68077e01957611969fd26f0bad4 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 09:34:06 +0100 Subject: [PATCH 1/7] C --- Lib/encodings/__init__.py | 33 +++++++++------- Lib/test/test_codecs.py | 6 ++- ...5-07-14-09-33-17.gh-issue-55531.Gt2e12.rst | 4 ++ Modules/_codecsmodule.c | 39 +++++++++++++++++++ Modules/clinic/_codecsmodule.c.h | 31 ++++++++++++++- Objects/unicodeobject.c | 15 +++---- Python/codecs.c | 7 ++-- Python/fileutils.c | 4 +- 8 files changed, 110 insertions(+), 29 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 298177eb8003a7..31ab4147668f26 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,7 +26,7 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -"""#" +""" import codecs import sys @@ -37,10 +37,23 @@ _import_tail = ['*'] _aliases = aliases.aliases + +_norm_encoding_map = ( + #0123456789ABCDEF0123456789ABCDEF + ' ' + ' . 0123456789 ' + ' ABCDEFGHIJKLMNOPQRSTUVWXYZ ' + ' abcdefghijklmnopqrstuvwxyz ' + ' ' + ' ' + ' ' + ' ') + + class CodecRegistryError(LookupError, SystemError): pass -def normalize_encoding(encoding): +def normalize_encoding(encoding, /): """ Normalize an encoding name. @@ -55,18 +68,10 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") - chars = [] - punct = False - for c in encoding: - if c.isalnum() or c == '.': - if punct and chars: - chars.append('_') - if c.isascii(): - chars.append(c) - punct = False - else: - punct = True - return ''.join(chars) + s = encoding.translate(_norm_encoding_map) + return '_'.join(s.split()) + +from _codecs import _normalize_encoding as normalize_encoding def search_function(encoding): diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index d8666f7290e72e..99ea833b60bce6 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3895,11 +3895,13 @@ def search_function(encoding): self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') - self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') + + # encodings.normalize_encoding() does not accept non-ASCII characters. + self.assertRaises(UnicodeEncodeError, normalize, 'utf\xE9\u20AC\U0010ffff-8') + # encodings.normalize_encoding() doesn't convert # characters to lower case. self.assertEqual(normalize('UTF 8'), 'UTF_8') diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst new file mode 100644 index 00000000000000..70e39a4f2c167c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst @@ -0,0 +1,4 @@ +:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance +by implementing the function in C using the private +``_Py_normalize_encoding`` which has been modified to make lowercase +conversion optional. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 7cf3f152eeecc6..c8ce3738693bc0 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1022,6 +1022,44 @@ _codecs_lookup_error_impl(PyObject *module, const char *name) return PyCodec_LookupError(name); } +extern int _Py_normalize_encoding(const char *, char *, size_t, int); + +/*[clinic input] +_codecs._normalize_encoding + encoding: str(encoding='ascii') + / + +Normalize an encoding name, while not converting to lower case (to_lower == 1). +Used for encodings.normalize_encoding. +[clinic start generated code]*/ + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, char *encoding) +/*[clinic end generated code: output=d5e3a4b5266fbe96 input=ca002bbc262228f1]*/ +{ + size_t len = strlen(encoding); + if (len > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "encoding is too large"); + return NULL; + } + + char *normalized = PyMem_Malloc(len + 1); + if (normalized == NULL) { + return PyErr_NoMemory(); + } + + if (!_Py_normalize_encoding(encoding, normalized, len + 1, 0)) { + PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); + PyMem_Free(normalized); + return NULL; + } + + PyObject *v = PyUnicode_FromString(normalized); + PyMem_Free(normalized); + return v; +} + + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -1071,6 +1109,7 @@ static PyMethodDef _codecs_functions[] = { _CODECS_REGISTER_ERROR_METHODDEF _CODECS__UNREGISTER_ERROR_METHODDEF _CODECS_LOOKUP_ERROR_METHODDEF + _CODECS__NORMALIZE_ENCODING_METHODDEF {NULL, NULL} /* sentinel */ }; diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index b0310325759326..eb4f481f8118de 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2779,6 +2779,35 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) return return_value; } +PyDoc_STRVAR(_codecs__normalize_encoding__doc__, +"_normalize_encoding($module, encoding, /)\n" +"--\n" +"\n" +"Normalize an encoding name. Used for encodings.normalize_encoding."); + +#define _CODECS__NORMALIZE_ENCODING_METHODDEF \ + {"_normalize_encoding", (PyCFunction)_codecs__normalize_encoding, METH_O, _codecs__normalize_encoding__doc__}, + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, char *encoding); + +static PyObject * +_codecs__normalize_encoding(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + char *encoding = NULL; + + if (!PyArg_Parse(arg, "es:_normalize_encoding", "ascii", &encoding)) { + goto exit; + } + return_value = _codecs__normalize_encoding_impl(module, encoding); + /* Post parse cleanup for encoding */ + PyMem_FREE(encoding); + +exit: + return return_value; +} + #ifndef _CODECS_MBCS_DECODE_METHODDEF #define _CODECS_MBCS_DECODE_METHODDEF #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ @@ -2802,4 +2831,4 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=aa3636e281f5268f input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c2308a012142a..64d8cf4397237c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3587,13 +3587,14 @@ PyUnicode_FromEncodedObject(PyObject *obj, return v; } -/* Normalize an encoding name: similar to encodings.normalize_encoding(), but - also convert to lowercase. Return 1 on success, or 0 on error (encoding is - longer than lower_len-1). */ +/* Normalize an encoding name like encodings.normalize_encoding() + Optionally covert convert to lowercase by setting *to_lower* to 1. + Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */ int _Py_normalize_encoding(const char *encoding, char *lower, - size_t lower_len) + size_t lower_len, + int to_lower) { const char *e; char *l; @@ -3624,7 +3625,7 @@ _Py_normalize_encoding(const char *encoding, if (l == l_end) { return 0; } - *l++ = Py_TOLOWER(c); + *l++ = to_lower ? Py_TOLOWER(c) : c; } else { punct = 1; @@ -3659,7 +3660,7 @@ PyUnicode_Decode(const char *s, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ @@ -3916,7 +3917,7 @@ PyUnicode_AsEncodedString(PyObject *unicode, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ diff --git a/Python/codecs.c b/Python/codecs.c index caf8d9d5f3c188..ffcb14928e0a82 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function) return 0; } -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are converted to lower case, spaces and hyphens are replaced with underscores. */ @@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string) } encoding = PyMem_Malloc(len + 1); - if (encoding == NULL) + if (encoding == NULL) { return PyErr_NoMemory(); + } - if (!_Py_normalize_encoding(string, encoding, len + 1)) + if (!_Py_normalize_encoding(string, encoding, len + 1, 1)) { PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(encoding); diff --git a/Python/fileutils.c b/Python/fileutils.c index 2a3f12d4e872f8..aedf8576c7a930 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs) #define USE_FORCE_ASCII -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale and POSIX locale. nl_langinfo(CODESET) announces an alias of the @@ -231,7 +231,7 @@ check_force_ascii(void) } char encoding[20]; /* longest name: "iso_646.irv_1991\0" */ - if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) { + if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) { goto error; } From 4bae23a7353f8ec04631dd647a5c51d56baf86c6 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 09:43:10 +0100 Subject: [PATCH 2/7] Correct clinic note --- Modules/_codecsmodule.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index c8ce3738693bc0..d9441ee9b2a74e 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1029,8 +1029,10 @@ _codecs._normalize_encoding encoding: str(encoding='ascii') / -Normalize an encoding name, while not converting to lower case (to_lower == 1). +Normalize an encoding name *encoding*. + Used for encodings.normalize_encoding. +Does not convert to lower case (to_lower == 1). [clinic start generated code]*/ static PyObject * From b5f3df3a44dff71ea9795e12e510fd0de9eafdcb Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 09:54:58 +0100 Subject: [PATCH 3/7] Little fixes --- Lib/encodings/__init__.py | 2 +- Modules/_codecsmodule.c | 6 ++--- Modules/clinic/_codecsmodule.c.h | 42 +++++++++++++++++++++++++++----- Objects/unicodeobject.c | 2 +- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 31ab4147668f26..ef15189b984f40 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -53,7 +53,7 @@ class CodecRegistryError(LookupError, SystemError): pass -def normalize_encoding(encoding, /): +def normalize_encoding(encoding): """ Normalize an encoding name. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index d9441ee9b2a74e..36e58015e84286 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1027,17 +1027,15 @@ extern int _Py_normalize_encoding(const char *, char *, size_t, int); /*[clinic input] _codecs._normalize_encoding encoding: str(encoding='ascii') - / Normalize an encoding name *encoding*. -Used for encodings.normalize_encoding. -Does not convert to lower case (to_lower == 1). +Used for encodings.normalize_encoding. Does not convert to lower case. [clinic start generated code]*/ static PyObject * _codecs__normalize_encoding_impl(PyObject *module, char *encoding) -/*[clinic end generated code: output=d5e3a4b5266fbe96 input=ca002bbc262228f1]*/ +/*[clinic end generated code: output=d5e3a4b5266fbe96 input=cdb53c013b2400e3]*/ { size_t len = strlen(encoding); if (len > PY_SSIZE_T_MAX) { diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index eb4f481f8118de..540c980216dcb6 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2780,24 +2780,54 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) } PyDoc_STRVAR(_codecs__normalize_encoding__doc__, -"_normalize_encoding($module, encoding, /)\n" +"_normalize_encoding($module, /, encoding)\n" "--\n" "\n" -"Normalize an encoding name. Used for encodings.normalize_encoding."); +"Normalize an encoding name *encoding*.\n" +"\n" +"Used for encodings.normalize_encoding. Does not convert to lower case."); #define _CODECS__NORMALIZE_ENCODING_METHODDEF \ - {"_normalize_encoding", (PyCFunction)_codecs__normalize_encoding, METH_O, _codecs__normalize_encoding__doc__}, + {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__}, static PyObject * _codecs__normalize_encoding_impl(PyObject *module, char *encoding); static PyObject * -_codecs__normalize_encoding(PyObject *module, PyObject *arg) +_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(encoding), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"encoding", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .format = "es:_normalize_encoding", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE char *encoding = NULL; - if (!PyArg_Parse(arg, "es:_normalize_encoding", "ascii", &encoding)) { + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + "ascii", &encoding)) { goto exit; } return_value = _codecs__normalize_encoding_impl(module, encoding); @@ -2831,4 +2861,4 @@ _codecs__normalize_encoding(PyObject *module, PyObject *arg) #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=aa3636e281f5268f input=a9049054013a1b77]*/ +/*[clinic end generated code: output=0859b218fa612efd input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 64d8cf4397237c..ba66e273a208be 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3588,7 +3588,7 @@ PyUnicode_FromEncodedObject(PyObject *obj, } /* Normalize an encoding name like encodings.normalize_encoding() - Optionally covert convert to lowercase by setting *to_lower* to 1. + but allow to convert to lowercase if *to_lower* is true. Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */ int _Py_normalize_encoding(const char *encoding, From 2ad72b20dfd463f2f17bea6d4c9284eff6cd39f6 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 10:17:22 +0100 Subject: [PATCH 4/7] Keep the messiness --- Lib/encodings/__init__.py | 18 ++---------------- Modules/_codecsmodule.c | 15 ++++++++++----- Modules/clinic/_codecsmodule.c.h | 21 +++++++++++++-------- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index ef15189b984f40..523b43e2f69cde 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -29,6 +29,7 @@ """ import codecs +from _codecs import _normalize_encoding import sys from . import aliases @@ -38,18 +39,6 @@ _aliases = aliases.aliases -_norm_encoding_map = ( - #0123456789ABCDEF0123456789ABCDEF - ' ' - ' . 0123456789 ' - ' ABCDEFGHIJKLMNOPQRSTUVWXYZ ' - ' abcdefghijklmnopqrstuvwxyz ' - ' ' - ' ' - ' ' - ' ') - - class CodecRegistryError(LookupError, SystemError): pass @@ -68,10 +57,7 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") - s = encoding.translate(_norm_encoding_map) - return '_'.join(s.split()) - -from _codecs import _normalize_encoding as normalize_encoding + return _normalize_encoding(encoding) def search_function(encoding): diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 36e58015e84286..f2ba4eb79650ab 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1026,7 +1026,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t, int); /*[clinic input] _codecs._normalize_encoding - encoding: str(encoding='ascii') + encoding: unicode Normalize an encoding name *encoding*. @@ -1034,10 +1034,15 @@ Used for encodings.normalize_encoding. Does not convert to lower case. [clinic start generated code]*/ static PyObject * -_codecs__normalize_encoding_impl(PyObject *module, char *encoding) -/*[clinic end generated code: output=d5e3a4b5266fbe96 input=cdb53c013b2400e3]*/ +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) +/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/ { - size_t len = strlen(encoding); + const char *cstr = PyUnicode_AsUTF8(encoding); + if (cstr == NULL) { + return NULL; + } + + size_t len = strlen(cstr); if (len > PY_SSIZE_T_MAX) { PyErr_SetString(PyExc_OverflowError, "encoding is too large"); return NULL; @@ -1048,7 +1053,7 @@ _codecs__normalize_encoding_impl(PyObject *module, char *encoding) return PyErr_NoMemory(); } - if (!_Py_normalize_encoding(encoding, normalized, len + 1, 0)) { + if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(normalized); return NULL; diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index 540c980216dcb6..9e2a7950ebde64 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2791,7 +2791,7 @@ PyDoc_STRVAR(_codecs__normalize_encoding__doc__, {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__}, static PyObject * -_codecs__normalize_encoding_impl(PyObject *module, char *encoding); +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding); static PyObject * _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) @@ -2820,19 +2820,24 @@ _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t static const char * const _keywords[] = {"encoding", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, - .format = "es:_normalize_encoding", + .fname = "_normalize_encoding", .kwtuple = KWTUPLE, }; #undef KWTUPLE - char *encoding = NULL; + PyObject *argsbuf[1]; + PyObject *encoding; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, - "ascii", &encoding)) { + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]); goto exit; } + encoding = args[0]; return_value = _codecs__normalize_encoding_impl(module, encoding); - /* Post parse cleanup for encoding */ - PyMem_FREE(encoding); exit: return return_value; @@ -2861,4 +2866,4 @@ _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=0859b218fa612efd input=a9049054013a1b77]*/ +/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/ From 3660160929bd17ad3afae8a00ad805dc1ff93ef6 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 10:29:49 +0100 Subject: [PATCH 5/7] Clean up tests --- Lib/test/test_codecs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 99ea833b60bce6..348b450d1118d2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3895,13 +3895,12 @@ def search_function(encoding): self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) def test_encodings_normalize_encoding(self): + # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') - # encodings.normalize_encoding() does not accept non-ASCII characters. - self.assertRaises(UnicodeEncodeError, normalize, 'utf\xE9\u20AC\U0010ffff-8') - # encodings.normalize_encoding() doesn't convert # characters to lower case. self.assertEqual(normalize('UTF 8'), 'UTF_8') From 4e12b9ec888d1d33c9d956e10318847e3a50b58b Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Mon, 14 Jul 2025 13:53:48 +0100 Subject: [PATCH 6/7] Remove unnecessary message --- Modules/_codecsmodule.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index f2ba4eb79650ab..1d3534ab98fc47 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1054,7 +1054,6 @@ _codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) } if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { - PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); PyMem_Free(normalized); return NULL; } @@ -1064,7 +1063,6 @@ _codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) return v; } - /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { From 1c9e55ab8ffafd2bb0e68c688fadab90399cfc16 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Tue, 15 Jul 2025 18:02:12 +0100 Subject: [PATCH 7/7] Review --- Lib/encodings/__init__.py | 5 ++--- Lib/test/test_codecs.py | 1 - Modules/_codecsmodule.c | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 523b43e2f69cde..e7e4ca3358e0f9 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -26,11 +26,11 @@ (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. -""" +"""#" import codecs -from _codecs import _normalize_encoding import sys +from _codecs import _normalize_encoding from . import aliases _cache = {} @@ -38,7 +38,6 @@ _import_tail = ['*'] _aliases = aliases.aliases - class CodecRegistryError(LookupError, SystemError): pass diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 348b450d1118d2..d8666f7290e72e 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3900,7 +3900,6 @@ def test_encodings_normalize_encoding(self): self.assertEqual(normalize('utf_8'), 'utf_8') self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') self.assertEqual(normalize('utf 8'), 'utf_8') - # encodings.normalize_encoding() doesn't convert # characters to lower case. self.assertEqual(normalize('UTF 8'), 'UTF_8') diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 1d3534ab98fc47..853d461ef15950 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1037,30 +1037,41 @@ static PyObject * _codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) /*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/ { - const char *cstr = PyUnicode_AsUTF8(encoding); + Py_ssize_t len; + const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len); if (cstr == NULL) { return NULL; } - size_t len = strlen(cstr); if (len > PY_SSIZE_T_MAX) { PyErr_SetString(PyExc_OverflowError, "encoding is too large"); return NULL; } + PyUnicodeWriter *writer = PyUnicodeWriter_Create(len + 1); + if (writer == NULL) { + return NULL; + } + char *normalized = PyMem_Malloc(len + 1); if (normalized == NULL) { + PyUnicodeWriter_Discard(writer); return PyErr_NoMemory(); } if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { PyMem_Free(normalized); + PyUnicodeWriter_Discard(writer); return NULL; } - PyObject *v = PyUnicode_FromString(normalized); + if (PyUnicodeWriter_WriteUTF8(writer, normalized, (Py_ssize_t)strlen(normalized)) < 0) { + PyUnicodeWriter_Discard(writer); + PyMem_Free(normalized); + return NULL; + } PyMem_Free(normalized); - return v; + return PyUnicodeWriter_Finish(writer); } /* --- Module API --------------------------------------------------------- */ pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy