Skip to content

Commit fa20e77

Browse files
vstinner authored and fasihahmad committed
bpo-40521: Make Unicode latin1 singletons per interpreter (pythonGH-21101)
Each interpreter now has its own Unicode latin1 singletons. Remove "ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and "ifdef LATIN1_SINGLETONS": always enable latin1 singletons. Optimize unicode_result_ready(): only attempt to get a latin1 singleton for PyUnicode_1BYTE_KIND.
1 parent 8742b82 commit fa20e77

File tree

3 files changed

+36
-43
lines changed

3 files changed

+36
-43
lines changed

Include/internal/pycore_interp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ struct _Py_bytes_state {
7373
struct _Py_unicode_state {
7474
// The empty Unicode object is a singleton to improve performance.
7575
PyObject *empty;
76+
/* Single character Unicode strings in the Latin-1 range are being
77+
shared as well. */
78+
PyObject *latin1[256];
7679
struct _Py_unicode_fs_codec fs_codec;
7780
};
7881

Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Each interpreter now has its own free lists, singletons and caches:
33
* Free lists: float, tuple, list, dict, frame, context,
44
asynchronous generator, MemoryError.
55
* Singletons: empty tuple, empty bytes string, empty Unicode string,
6-
single byte character.
6+
single byte character, single Unicode (latin1) character.
77
* Slice cache.
88

99
They are no longer shared by all interpreters.

Objects/unicodeobject.c

Lines changed: 32 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -303,17 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
303303
/* List of static strings. */
304304
static _Py_Identifier *static_strings = NULL;
305305

306-
/* bpo-40521: Latin1 singletons are shared by all interpreters. */
307-
#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
308-
# define LATIN1_SINGLETONS
309-
#endif
310-
311-
#ifdef LATIN1_SINGLETONS
312-
/* Single character Unicode strings in the Latin-1 range are being
313-
shared as well. */
314-
static PyObject *unicode_latin1[256] = {NULL};
315-
#endif
316-
317306
/* Fast detection of the most frequent whitespace characters */
318307
const unsigned char _Py_ascii_whitespace[] = {
319308
0, 0, 0, 0, 0, 0, 0, 0,
@@ -657,9 +646,8 @@ unicode_result_wchar(PyObject *unicode)
657646
if (len == 1) {
658647
wchar_t ch = _PyUnicode_WSTR(unicode)[0];
659648
if ((Py_UCS4)ch < 256) {
660-
PyObject *latin1_char = get_latin1_char((unsigned char)ch);
661649
Py_DECREF(unicode);
662-
return latin1_char;
650+
return get_latin1_char((unsigned char)ch);
663651
}
664652
}
665653

@@ -692,13 +680,13 @@ unicode_result_ready(PyObject *unicode)
692680
return empty;
693681
}
694682

695-
#ifdef LATIN1_SINGLETONS
696683
if (length == 1) {
697-
const void *data = PyUnicode_DATA(unicode);
698684
int kind = PyUnicode_KIND(unicode);
699-
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
700-
if (ch < 256) {
701-
PyObject *latin1_char = unicode_latin1[ch];
685+
if (kind == PyUnicode_1BYTE_KIND) {
686+
Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
687+
Py_UCS1 ch = data[0];
688+
struct _Py_unicode_state *state = get_unicode_state();
689+
PyObject *latin1_char = state->latin1[ch];
702690
if (latin1_char != NULL) {
703691
if (unicode != latin1_char) {
704692
Py_INCREF(latin1_char);
@@ -709,12 +697,14 @@ unicode_result_ready(PyObject *unicode)
709697
else {
710698
assert(_PyUnicode_CheckConsistency(unicode, 1));
711699
Py_INCREF(unicode);
712-
unicode_latin1[ch] = unicode;
700+
state->latin1[ch] = unicode;
713701
return unicode;
714702
}
715703
}
704+
else {
705+
assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
706+
}
716707
}
717-
#endif
718708

719709
assert(_PyUnicode_CheckConsistency(unicode, 1));
720710
return unicode;
@@ -1981,18 +1971,18 @@ unicode_dealloc(PyObject *unicode)
19811971
static int
19821972
unicode_is_singleton(PyObject *unicode)
19831973
{
1984-
if (unicode == unicode_get_empty()) {
1974+
struct _Py_unicode_state *state = get_unicode_state();
1975+
if (unicode == state->empty) {
19851976
return 1;
19861977
}
1987-
#ifdef LATIN1_SINGLETONS
19881978
PyASCIIObject *ascii = (PyASCIIObject *)unicode;
19891979
if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
19901980
{
19911981
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1992-
if (ch < 256 && unicode_latin1[ch] == unicode)
1982+
if (ch < 256 && state->latin1[ch] == unicode) {
19931983
return 1;
1984+
}
19941985
}
1995-
#endif
19961986
return 0;
19971987
}
19981988
#endif
@@ -2130,17 +2120,15 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
21302120
}
21312121

21322122
static PyObject*
2133-
get_latin1_char(unsigned char ch)
2123+
get_latin1_char(Py_UCS1 ch)
21342124
{
2135-
PyObject *unicode;
2125+
struct _Py_unicode_state *state = get_unicode_state();
21362126

2137-
#ifdef LATIN1_SINGLETONS
2138-
unicode = unicode_latin1[ch];
2127+
PyObject *unicode = state->latin1[ch];
21392128
if (unicode) {
21402129
Py_INCREF(unicode);
21412130
return unicode;
21422131
}
2143-
#endif
21442132

21452133
unicode = PyUnicode_New(1, ch);
21462134
if (!unicode) {
@@ -2150,10 +2138,8 @@ get_latin1_char(unsigned char ch)
21502138
PyUnicode_1BYTE_DATA(unicode)[0] = ch;
21512139
assert(_PyUnicode_CheckConsistency(unicode, 1));
21522140

2153-
#ifdef LATIN1_SINGLETONS
21542141
Py_INCREF(unicode);
2155-
unicode_latin1[ch] = unicode;
2156-
#endif
2142+
state->latin1[ch] = unicode;
21572143
return unicode;
21582144
}
21592145

@@ -2164,8 +2150,9 @@ unicode_char(Py_UCS4 ch)
21642150

21652151
assert(ch <= MAX_UNICODE);
21662152

2167-
if (ch < 256)
2153+
if (ch < 256) {
21682154
return get_latin1_char(ch);
2155+
}
21692156

21702157
unicode = PyUnicode_New(1, ch);
21712158
if (unicode == NULL)
@@ -2367,11 +2354,13 @@ _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
23672354
PyObject *res;
23682355
unsigned char max_char;
23692356

2370-
if (size == 0)
2357+
if (size == 0) {
23712358
_Py_RETURN_UNICODE_EMPTY();
2359+
}
23722360
assert(size > 0);
2373-
if (size == 1)
2361+
if (size == 1) {
23742362
return get_latin1_char(u[0]);
2363+
}
23752364

23762365
max_char = ucs1lib_find_max_char(u, u + size);
23772366
res = PyUnicode_New(size, max_char);
@@ -5008,8 +4997,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
50084997

50094998
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
50104999
if (size == 1 && (unsigned char)s[0] < 128) {
5011-
if (consumed)
5000+
if (consumed) {
50125001
*consumed = 1;
5002+
}
50135003
return get_latin1_char((unsigned char)s[0]);
50145004
}
50155005

@@ -7176,8 +7166,9 @@ PyUnicode_DecodeASCII(const char *s,
71767166
_Py_RETURN_UNICODE_EMPTY();
71777167

71787168
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
7179-
if (size == 1 && (unsigned char)s[0] < 128)
7169+
if (size == 1 && (unsigned char)s[0] < 128) {
71807170
return get_latin1_char((unsigned char)s[0]);
7171+
}
71817172

71827173
// Shortcut for simple case
71837174
PyObject *u = PyUnicode_New(size, 127);
@@ -16234,12 +16225,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
1623416225

1623516226
Py_CLEAR(state->empty);
1623616227

16228+
for (Py_ssize_t i = 0; i < 256; i++) {
16229+
Py_CLEAR(state->latin1[i]);
16230+
}
16231+
1623716232
if (is_main_interp) {
16238-
#ifdef LATIN1_SINGLETONS
16239-
for (Py_ssize_t i = 0; i < 256; i++) {
16240-
Py_CLEAR(unicode_latin1[i]);
16241-
}
16242-
#endif
1624316233
unicode_clear_static_strings();
1624416234
}
1624516235

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy