From 5126f97c238ef4bbc5cb6decaedb36f1241e8fe1 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 23 Jun 2020 17:43:45 +0200 Subject: [PATCH] bpo-40521: Make empty Unicode string per interpreter Each interpreter now has its own empty Unicode string singleton. --- Include/internal/pycore_interp.h | 2 + Include/internal/pycore_pylifecycle.h | 2 +- .../2020-05-20-01-17-34.bpo-40521.wvAehI.rst | 2 +- Objects/stringlib/asciilib.h | 1 - Objects/stringlib/partition.h | 7 +- Objects/stringlib/stringdefs.h | 4 - Objects/stringlib/ucs1lib.h | 1 - Objects/stringlib/ucs2lib.h | 1 - Objects/stringlib/ucs4lib.h | 1 - Objects/stringlib/unicodedefs.h | 1 - Objects/unicodeobject.c | 190 +++++++++++------- Python/pylifecycle.c | 8 +- 12 files changed, 130 insertions(+), 90 deletions(-) diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 435a72a522011f..d8947e700f84e8 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -71,6 +71,8 @@ struct _Py_bytes_state { }; struct _Py_unicode_state { + // The empty Unicode object is a singleton to improve performance. 
+ PyObject *empty; struct _Py_unicode_fs_codec fs_codec; }; diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h index cd470441817a2d..f29c7cb9f392ca 100644 --- a/Include/internal/pycore_pylifecycle.h +++ b/Include/internal/pycore_pylifecycle.h @@ -31,7 +31,7 @@ PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc); /* Various one-time initializers */ -extern PyStatus _PyUnicode_Init(void); +extern PyStatus _PyUnicode_Init(PyThreadState *tstate); extern int _PyStructSequence_Init(void); extern int _PyLong_Init(PyThreadState *tstate); extern PyStatus _PyFaulthandler_Init(int enable); diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst index 9b94bcc016927f..e970551f531d0c 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst @@ -2,7 +2,7 @@ Each interpreter now its has own free lists, singletons and caches: * Free lists: float, tuple, list, dict, frame, context, asynchronous generator, MemoryError. -* Singletons: empty tuple, empty bytes string, +* Singletons: empty tuple, empty bytes string, empty Unicode string, single byte character. * Slice cache. 
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h index 8599d38a5a7f54..7749e8fb339822 100644 --- a/Objects/stringlib/asciilib.h +++ b/Objects/stringlib/asciilib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h index 3731df56987fd5..bcc217697b2e9c 100644 --- a/Objects/stringlib/partition.h +++ b/Objects/stringlib/partition.h @@ -1,9 +1,14 @@ /* stringlib: partition implementation */ #ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module +# error must include "stringlib/fastsearch.h" before including this module #endif +#if !STRINGLIB_MUTABLE && !defined(STRINGLIB_GET_EMPTY) +# error "STRINGLIB_GET_EMPTY must be defined if STRINGLIB_MUTABLE is zero" +#endif + + Py_LOCAL_INLINE(PyObject*) STRINGLIB(partition)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index c12ecc59e5c6d4..88641b25d47c6f 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -1,10 +1,6 @@ #ifndef STRINGLIB_STRINGDEFS_H #define STRINGLIB_STRINGDEFS_H -#ifndef STRINGLIB_GET_EMPTY -# error "STRINGLIB_GET_EMPTY macro must be defined" -#endif - /* this is sort of a hack. there's at least one place (formatting floats) where some stringlib code takes a different path if it's compiled as unicode. 
*/ diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h index bdf30356b8457d..5b0b8a025e808f 100644 --- a/Objects/stringlib/ucs1lib.h +++ b/Objects/stringlib/ucs1lib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h index 9d6888801867d2..6af01511c5f8ab 100644 --- a/Objects/stringlib/ucs2lib.h +++ b/Objects/stringlib/ucs2lib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS2 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h index c7dfa527433e31..39071a0cdf0cde 100644 --- a/Objects/stringlib/ucs4lib.h +++ b/Objects/stringlib/ucs4lib.h @@ -11,7 +11,6 @@ #define STRINGLIB_CHAR Py_UCS4 #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h index e4d4163afc2f94..5ea79cd4f50ac3 100644 --- a/Objects/stringlib/unicodedefs.h +++ b/Objects/stringlib/unicodedefs.h @@ -13,7 +13,6 @@ #define STRINGLIB_CHAR Py_UNICODE #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_GET_EMPTY() unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL diff --git 
a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1433848c81f8e1..06ca7a5751d2fe 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -222,26 +222,43 @@ extern "C" { static PyObject *interned = NULL; #endif -/* The empty Unicode object is shared to improve performance. */ -static PyObject *unicode_empty = NULL; +static struct _Py_unicode_state* +get_unicode_state(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + return &interp->unicode; +} -#define _Py_INCREF_UNICODE_EMPTY() \ - do { \ - if (unicode_empty != NULL) \ - Py_INCREF(unicode_empty); \ - else { \ - unicode_empty = PyUnicode_New(0, 0); \ - if (unicode_empty != NULL) { \ - Py_INCREF(unicode_empty); \ - assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ - } \ - } \ - } while (0) -#define _Py_RETURN_UNICODE_EMPTY() \ - do { \ - _Py_INCREF_UNICODE_EMPTY(); \ - return unicode_empty; \ +// Return a borrowed reference to the empty string singleton. +// Return NULL if the singleton was not created yet. 
+static inline PyObject* unicode_get_empty(void) +{ + struct _Py_unicode_state *state = get_unicode_state(); + return state->empty; +} + +static inline PyObject* unicode_new_empty(void) +{ + struct _Py_unicode_state *state = get_unicode_state(); + PyObject *empty = state->empty; + if (empty != NULL) { + Py_INCREF(empty); + } + else { + empty = PyUnicode_New(0, 0); + if (empty != NULL) { + Py_INCREF(empty); + assert(_PyUnicode_CheckConsistency(empty, 1)); + state->empty = empty; + } + } + return empty; +} + +#define _Py_RETURN_UNICODE_EMPTY() \ + do { \ + return unicode_new_empty(); \ } while (0) static inline void @@ -676,11 +693,15 @@ unicode_result_ready(PyObject *unicode) length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { - if (unicode != unicode_empty) { + PyObject *empty = unicode_get_empty(); + if (unicode != empty) { Py_DECREF(unicode); - _Py_RETURN_UNICODE_EMPTY(); + + Py_INCREF(empty); + return empty; } - return unicode_empty; + // unicode is the empty string singleton + return unicode; } #ifdef LATIN1_SINGLETONS @@ -864,7 +885,7 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str, to keep things simple, we use a single bitmask, using the least 5 bits from each unicode characters as the bit index. 
*/ -/* the linebreak mask is set up by Unicode_Init below */ +/* the linebreak mask is set up by _PyUnicode_Init() below */ #if LONG_BIT >= 128 #define BLOOM_WIDTH 128 @@ -938,6 +959,8 @@ ensure_unicode(PyObject *obj) /* Compilation of templated routines */ +#define STRINGLIB_GET_EMPTY() unicode_get_empty() + #include "stringlib/asciilib.h" #include "stringlib/fastsearch.h" #include "stringlib/partition.h" @@ -986,6 +1009,8 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS #include "stringlib/undef.h" _Py_COMP_DIAG_POP +#undef STRINGLIB_GET_EMPTY + /* --- Unicode Object ----------------------------------------------------- */ static inline Py_ssize_t @@ -1234,9 +1259,12 @@ _PyUnicode_New(Py_ssize_t length) size_t new_size; /* Optimization for empty strings */ - if (length == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return (PyUnicodeObject*)unicode_empty; + if (length == 0) { + PyObject *empty = unicode_get_empty(); + if (empty != NULL) { + Py_INCREF(empty); + return (PyUnicodeObject *)empty; + } } /* Ensure we won't overflow the size. 
*/ @@ -1386,6 +1414,15 @@ _PyUnicode_Dump(PyObject *op) PyObject * PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) { + /* Optimization for empty strings */ + if (size == 0) { + PyObject *empty = unicode_get_empty(); + if (empty != NULL) { + Py_INCREF(empty); + return empty; + } + } + PyObject *obj; PyCompactUnicodeObject *unicode; void *data; @@ -1394,12 +1431,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) Py_ssize_t char_size; Py_ssize_t struct_size; - /* Optimization for empty strings */ - if (size == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return unicode_empty; - } - is_ascii = 0; is_sharing = 0; struct_size = sizeof(PyCompactUnicodeObject); @@ -1970,7 +2001,8 @@ unicode_dealloc(PyObject *unicode) static int unicode_is_singleton(PyObject *unicode) { - if (unicode == unicode_empty) { + struct _Py_unicode_state *state = get_unicode_state(); + if (unicode == state->empty) { return 1; } #ifdef LATIN1_SINGLETONS @@ -2026,10 +2058,10 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) return 0; if (length == 0) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_new_empty(); + if (!empty) return -1; - Py_SETREF(*p_unicode, unicode_empty); + Py_SETREF(*p_unicode, empty); return 0; } @@ -10836,10 +10868,10 @@ replace(PyObject *self, PyObject *str1, } new_size = slen + n * (len2 - len1); if (new_size == 0) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_new_empty(); + if (!empty) goto error; - u = unicode_empty; + u = empty; goto done; } if (new_size > (PY_SSIZE_T_MAX / rkind)) { @@ -11497,10 +11529,13 @@ PyUnicode_Concat(PyObject *left, PyObject *right) return NULL; /* Shortcuts */ - if (left == unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (left == empty) { return PyUnicode_FromObject(right); - if (right == unicode_empty) + } + if (right == empty) { return PyUnicode_FromObject(left); + } left_len = PyUnicode_GET_LENGTH(left); right_len = 
PyUnicode_GET_LENGTH(right); @@ -11551,14 +11586,16 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) goto error; /* Shortcuts */ - if (left == unicode_empty) { + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (left == empty) { Py_DECREF(left); Py_INCREF(right); *p_left = right; return; } - if (right == unicode_empty) + if (right == empty) { return; + } left_len = PyUnicode_GET_LENGTH(left); right_len = PyUnicode_GET_LENGTH(right); @@ -13255,12 +13292,12 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (!empty) { out = NULL; + } else { - out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); - Py_DECREF(unicode_empty); + out = PyTuple_Pack(3, str_obj, empty, empty); } return out; } @@ -13313,12 +13350,12 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (!empty) { out = NULL; + } else { - out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); - Py_DECREF(unicode_empty); + out = PyTuple_Pack(3, empty, empty, str_obj); } return out; } @@ -15538,10 +15575,10 @@ PyTypeObject PyUnicode_Type = { /* Initialize the Unicode implementation */ PyStatus -_PyUnicode_Init(void) +_PyUnicode_Init(PyThreadState *tstate) { /* XXX - move this array to unicodectype.c ? 
*/ - Py_UCS2 linebreak[] = { + const Py_UCS2 linebreak[] = { 0x000A, /* LINE FEED */ 0x000D, /* CARRIAGE RETURN */ 0x001C, /* FILE SEPARATOR */ @@ -15553,29 +15590,31 @@ _PyUnicode_Init(void) }; /* Init the implementation */ - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) { - return _PyStatus_ERR("Can't create empty string"); + PyObject *empty = unicode_new_empty(); + if (!empty) { + return _PyStatus_NO_MEMORY(); } - Py_DECREF(unicode_empty); + Py_DECREF(empty); - if (PyType_Ready(&PyUnicode_Type) < 0) { - return _PyStatus_ERR("Can't initialize unicode type"); - } + if (_Py_IsMainInterpreter(tstate)) { + /* initialize the linebreak bloom filter */ + bloom_linebreak = make_bloom_mask( + PyUnicode_2BYTE_KIND, linebreak, + Py_ARRAY_LENGTH(linebreak)); - /* initialize the linebreak bloom filter */ - bloom_linebreak = make_bloom_mask( - PyUnicode_2BYTE_KIND, linebreak, - Py_ARRAY_LENGTH(linebreak)); + if (PyType_Ready(&PyUnicode_Type) < 0) { + return _PyStatus_ERR("Can't initialize unicode type"); + } - if (PyType_Ready(&EncodingMapType) < 0) { - return _PyStatus_ERR("Can't initialize encoding map type"); - } - if (PyType_Ready(&PyFieldNameIter_Type) < 0) { - return _PyStatus_ERR("Can't initialize field name iterator type"); - } - if (PyType_Ready(&PyFormatterIter_Type) < 0) { - return _PyStatus_ERR("Can't initialize formatter iter type"); + if (PyType_Ready(&EncodingMapType) < 0) { + return _PyStatus_ERR("Can't initialize encoding map type"); + } + if (PyType_Ready(&PyFieldNameIter_Type) < 0) { + return _PyStatus_ERR("Can't initialize field name iterator type"); + } + if (PyType_Ready(&PyFormatterIter_Type) < 0) { + return _PyStatus_ERR("Can't initialize formatter iter type"); + } } return _PyStatus_OK(); } @@ -16205,7 +16244,10 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) void _PyUnicode_Fini(PyThreadState *tstate) { - if (_Py_IsMainInterpreter(tstate)) { + struct _Py_unicode_state *state = &tstate->interp->unicode; + + int is_main_interp = 
_Py_IsMainInterpreter(tstate); + if (is_main_interp) { #if defined(WITH_VALGRIND) || defined(__INSURE__) /* Insure++ is a memory analysis tool that aids in discovering * memory leaks and other memory problems. On Python exit, the @@ -16218,9 +16260,11 @@ _PyUnicode_Fini(PyThreadState *tstate) */ unicode_release_interned(); #endif /* __INSURE__ */ + } - Py_CLEAR(unicode_empty); + Py_CLEAR(state->empty); + if (is_main_interp) { #ifdef LATIN1_SINGLETONS for (Py_ssize_t i = 0; i < 256; i++) { Py_CLEAR(unicode_latin1[i]); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index f0b40b3aa68e32..eda4c6ad7e474c 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -595,11 +595,9 @@ pycore_init_types(PyThreadState *tstate) return _PyStatus_ERR("can't init longs"); } - if (is_main_interp) { - status = _PyUnicode_Init(); - if (_PyStatus_EXCEPTION(status)) { - return status; - } + status = _PyUnicode_Init(tstate); + if (_PyStatus_EXCEPTION(status)) { + return status; } status = _PyExc_Init(tstate); pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy