From b24fd1341c32a30f7d08a852cf6cbb5649684226 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 24 Jun 2020 00:17:47 +0200 Subject: [PATCH] bpo-40521: Make Unicode latin1 singletons per interpreter Each interpreter now has its own Unicode latin1 singletons. Remove "ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and "ifdef LATIN1_SINGLETONS": always enable latin1 singletons. Optimize unicode_result_ready(): only attempt to get a latin1 singleton for PyUnicode_1BYTE_KIND. --- Include/internal/pycore_interp.h | 3 + .../2020-05-20-01-17-34.bpo-40521.wvAehI.rst | 2 +- Objects/unicodeobject.c | 74 ++++++++----------- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index d8947e700f84e8..bf1769e5ce2c24 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -73,6 +73,9 @@ struct _Py_bytes_state { struct _Py_unicode_state { // The empty Unicode object is a singleton to improve performance. PyObject *empty; + /* Single character Unicode strings in the Latin-1 range are being + shared as well. */ + PyObject *latin1[256]; struct _Py_unicode_fs_codec fs_codec; }; diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst index e970551f531d0c..43226931ccc88d 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst @@ -3,7 +3,7 @@ Each interpreter now its has own free lists, singletons and caches: * Free lists: float, tuple, list, dict, frame, context, asynchronous generator, MemoryError. * Singletons: empty tuple, empty bytes string, empty Unicode string, - single byte character. + single byte character, single Unicode (latin1) character. * Slice cache. They are no longer shared by all interpreters. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e4235b1aca3cf6..5ba99514d29691 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -303,17 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, /* List of static strings. */ static _Py_Identifier *static_strings = NULL; -/* bpo-40521: Latin1 singletons are shared by all interpreters. */ -#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS -# define LATIN1_SINGLETONS -#endif - -#ifdef LATIN1_SINGLETONS -/* Single character Unicode strings in the Latin-1 range are being - shared as well. */ -static PyObject *unicode_latin1[256] = {NULL}; -#endif - /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -657,9 +646,8 @@ unicode_result_wchar(PyObject *unicode) if (len == 1) { wchar_t ch = _PyUnicode_WSTR(unicode)[0]; if ((Py_UCS4)ch < 256) { - PyObject *latin1_char = get_latin1_char((unsigned char)ch); Py_DECREF(unicode); - return latin1_char; + return get_latin1_char((unsigned char)ch); } } @@ -692,13 +680,13 @@ unicode_result_ready(PyObject *unicode) return empty; } -#ifdef LATIN1_SINGLETONS if (length == 1) { - const void *data = PyUnicode_DATA(unicode); int kind = PyUnicode_KIND(unicode); - Py_UCS4 ch = PyUnicode_READ(kind, data, 0); - if (ch < 256) { - PyObject *latin1_char = unicode_latin1[ch]; + if (kind == PyUnicode_1BYTE_KIND) { + Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); + Py_UCS1 ch = data[0]; + struct _Py_unicode_state *state = get_unicode_state(); + PyObject *latin1_char = state->latin1[ch]; if (latin1_char != NULL) { if (unicode != latin1_char) { Py_INCREF(latin1_char); @@ -709,12 +697,14 @@ unicode_result_ready(PyObject *unicode) else { assert(_PyUnicode_CheckConsistency(unicode, 1)); Py_INCREF(unicode); - unicode_latin1[ch] = unicode; + state->latin1[ch] = unicode; return unicode; } } + else { + assert(PyUnicode_READ_CHAR(unicode, 0) >= 256); + } } -#endif assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; @@ -1981,18 +1971,18 @@ unicode_dealloc(PyObject *unicode) static int unicode_is_singleton(PyObject *unicode) { - if (unicode == unicode_get_empty()) { + struct _Py_unicode_state *state = get_unicode_state(); + if (unicode == state->empty) { return 1; } -#ifdef LATIN1_SINGLETONS PyASCIIObject *ascii = (PyASCIIObject *)unicode; if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) { Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); - if (ch < 256 && unicode_latin1[ch] == unicode) + if (ch < 256 && state->latin1[ch] == unicode) { return 1; + } } -#endif return 0; } #endif @@ -2130,17 +2120,15 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index, } static PyObject* -get_latin1_char(unsigned char ch) +get_latin1_char(Py_UCS1 ch) { - PyObject *unicode; + struct _Py_unicode_state *state = get_unicode_state(); -#ifdef LATIN1_SINGLETONS - unicode = unicode_latin1[ch]; + PyObject *unicode = state->latin1[ch]; if (unicode) { Py_INCREF(unicode); return unicode; } -#endif unicode = PyUnicode_New(1, ch); if (!unicode) { @@ -2150,10 +2138,8 @@ get_latin1_char(unsigned char ch) PyUnicode_1BYTE_DATA(unicode)[0] = ch; assert(_PyUnicode_CheckConsistency(unicode, 1)); -#ifdef LATIN1_SINGLETONS Py_INCREF(unicode); - unicode_latin1[ch] = unicode; -#endif + state->latin1[ch] = unicode; return unicode; } @@ -2164,8 +2150,9 @@ unicode_char(Py_UCS4 ch) assert(ch <= MAX_UNICODE); - if (ch < 256) + if (ch < 256) { return get_latin1_char(ch); + } unicode = PyUnicode_New(1, ch); if (unicode == NULL) @@ -2367,11 +2354,13 @@ _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) PyObject *res; unsigned char max_char; - if (size == 0) + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); + } assert(size > 0); - if (size == 1) + if (size == 1) { return get_latin1_char(u[0]); + } max_char = ucs1lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); @@ -5008,8 +4997,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) { - if (consumed) + if (consumed) { *consumed = 1; + } return get_latin1_char((unsigned char)s[0]); } @@ -7176,8 +7166,9 @@ PyUnicode_DecodeASCII(const char *s, _Py_RETURN_UNICODE_EMPTY(); /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 && (unsigned char)s[0] < 128) + if (size == 1 && (unsigned char)s[0] < 128) { return get_latin1_char((unsigned char)s[0]); + } // Shortcut for simple case PyObject *u = PyUnicode_New(size, 127); @@ -16234,12 +16225,11 @@ _PyUnicode_Fini(PyThreadState *tstate) Py_CLEAR(state->empty); + for (Py_ssize_t i = 0; i < 256; i++) { + Py_CLEAR(state->latin1[i]); + } + if (is_main_interp) { -#ifdef LATIN1_SINGLETONS - for (Py_ssize_t i = 0; i < 256; i++) { - Py_CLEAR(unicode_latin1[i]); - } -#endif unicode_clear_static_strings(); } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy