diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 298177eb8003a7..e7e4ca3358e0f9 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -30,6 +30,7 @@
 
 import codecs
 import sys
+from _codecs import _normalize_encoding
 from . import aliases
 
 _cache = {}
@@ -55,18 +56,7 @@ def normalize_encoding(encoding):
     if isinstance(encoding, bytes):
         encoding = str(encoding, "ascii")
 
-    chars = []
-    punct = False
-    for c in encoding:
-        if c.isalnum() or c == '.':
-            if punct and chars:
-                chars.append('_')
-            if c.isascii():
-                chars.append(c)
-            punct = False
-        else:
-            punct = True
-    return ''.join(chars)
+    return _normalize_encoding(encoding)
 
 def search_function(encoding):
 
diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
new file mode 100644
index 00000000000000..70e39a4f2c167c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
@@ -0,0 +1,4 @@
+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
+by implementing the function in C using the private
+``_Py_normalize_encoding`` which has been modified to make lowercase
+conversion optional.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index 7cf3f152eeecc6..853d461ef15950 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -1022,6 +1022,58 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
     return PyCodec_LookupError(name);
 }
 
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
+
+/*[clinic input]
+_codecs._normalize_encoding
+    encoding: unicode
+
+Normalize an encoding name *encoding*.
+
+Used for encodings.normalize_encoding. Does not convert to lower case.
+[clinic start generated code]*/
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
+/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
+{
+    Py_ssize_t len;
+    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
+    if (cstr == NULL) {
+        return NULL;
+    }
+
+    /* The output buffer needs len + 1 bytes (trailing NUL), so reject the
+       one length for which that sum would overflow Py_ssize_t. */
+    if (len >= PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
+        return NULL;
+    }
+
+    /* NOTE(review): cstr is consumed as a NUL-terminated C string below,
+       so an embedded null character presumably truncates the name;
+       confirm this is acceptable for encodings.normalize_encoding(). */
+    char *normalized = PyMem_Malloc(len + 1);
+    if (normalized == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    /* Failure means the buffer was too small, which len + 1 should rule
+       out; still set an exception so NULL is never returned without one. */
+    if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "_Py_normalize_encoding() failed");
+        PyMem_Free(normalized);
+        return NULL;
+    }
+
+    /* Decode the NUL-terminated result directly; a PyUnicodeWriter would
+       only add an allocation and extra cleanup on every error path. */
+    PyObject *result = PyUnicode_FromString(normalized);
+    PyMem_Free(normalized);
+    return result;
+}
+
 /* --- Module API --------------------------------------------------------- */
 
 static PyMethodDef _codecs_functions[] = {
@@ -1071,6 +1123,7 @@ static PyMethodDef _codecs_functions[] = {
     _CODECS_REGISTER_ERROR_METHODDEF
     _CODECS__UNREGISTER_ERROR_METHODDEF
     _CODECS_LOOKUP_ERROR_METHODDEF
+    _CODECS__NORMALIZE_ENCODING_METHODDEF
     {NULL, NULL}                /* sentinel */
 };
 
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
index b0310325759326..9e2a7950ebde64 100644
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -2779,6 +2779,70 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
     return return_value;
 }
 
+PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
+"_normalize_encoding($module, /, encoding)\n"
+"--\n"
+"\n"
+"Normalize an encoding name *encoding*.\n"
+"\n"
+"Used for encodings.normalize_encoding. Does not convert to lower case.");
+
+#define _CODECS__NORMALIZE_ENCODING_METHODDEF    \
+    {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__},
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding);
+
+static PyObject *
+_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(encoding), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"encoding", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "_normalize_encoding",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];
+    PyObject *encoding;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(args[0])) {
+        _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]);
+        goto exit;
+    }
+    encoding = args[0];
+    return_value = _codecs__normalize_encoding_impl(module, encoding);
+
+exit:
+    return return_value;
+}
+
 #ifndef _CODECS_MBCS_DECODE_METHODDEF
     #define _CODECS_MBCS_DECODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
@@ -2802,4 +2866,4 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
 #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
     #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5c2308a012142a..ba66e273a208be 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3587,13 +3587,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
     return v;
 }
 
-/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
-   also convert to lowercase. Return 1 on success, or 0 on error (encoding is
-   longer than lower_len-1). */
+/* Normalize an encoding name like encodings.normalize_encoding(),
+   additionally converting it to lowercase if *to_lower* is true.
+   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
 int
 _Py_normalize_encoding(const char *encoding,
                        char *lower,
-                       size_t lower_len)
+                       size_t lower_len,
+                       int to_lower)
 {
     const char *e;
     char *l;
@@ -3624,7 +3625,7 @@ _Py_normalize_encoding(const char *encoding,
             if (l == l_end) {
                 return 0;
             }
-            *l++ = Py_TOLOWER(c);
+            *l++ = to_lower ? Py_TOLOWER(c) : c;
         }
         else {
             punct = 1;
@@ -3659,7 +3660,7 @@ PyUnicode_Decode(const char *s,
     }
 
     /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
         char *lower = buflower;
 
         /* Fast paths */
@@ -3916,7 +3917,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
     }
 
     /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
         char *lower = buflower;
 
         /* Fast paths */
diff --git a/Python/codecs.c b/Python/codecs.c
index caf8d9d5f3c188..ffcb14928e0a82 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function)
     return 0;
 }
 
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
 
 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
    converted to lower case, spaces and hyphens are replaced with underscores. */
@@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string)
     }
 
     encoding = PyMem_Malloc(len + 1);
-    if (encoding == NULL)
+    if (encoding == NULL) {
         return PyErr_NoMemory();
+    }
 
-    if (!_Py_normalize_encoding(string, encoding, len + 1))
+    if (!_Py_normalize_encoding(string, encoding, len + 1, 1))
     {
         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
         PyMem_Free(encoding);
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 2a3f12d4e872f8..aedf8576c7a930 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
 
 #define USE_FORCE_ASCII
 
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
 
 /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
    and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -231,7 +231,7 @@ check_force_ascii(void)
     }
 
     char encoding[20];   /* longest name: "iso_646.irv_1991\0" */
-    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
         goto error;
     }
 
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: