Skip to content

Commit ab9893c

Browse files
[3.10] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (GH-134345)
If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after destroying that temporary bytes object. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58) (cherry picked from commit 6279eb8) (cherry picked from commit a75953b) (cherry picked from commit 0c33e5b) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent f85e71a commit ab9893c

File tree

8 files changed

+164
-41
lines changed

8 files changed

+164
-41
lines changed

Include/cpython/bytesobject.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
2525
int use_bytearray);
2626

2727
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
28+
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
29+
const char *,
30+
int *, const char **);
31+
// Export for binary compatibility.
2832
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
2933
const char *, const char **);
3034

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
844844

845845
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
846846
chars. */
847+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
848+
const char *string, /* Unicode-Escape encoded string */
849+
Py_ssize_t length, /* size of string */
850+
const char *errors, /* error handling */
851+
Py_ssize_t *consumed, /* bytes consumed */
852+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
853+
invalid escaped char (<= 0xff) or invalid
854+
octal escape (> 0xff) in string. */
855+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
856+
point to the first invalid escaped
857+
char in string.
858+
May be NULL if errors is not NULL. */
859+
// Export for binary compatibility.
847860
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
848861
const char *string, /* Unicode-Escape encoded string */
849862
Py_ssize_t length, /* size of string */

Lib/test/test_codeccallbacks.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1124,7 +1124,7 @@ def test_bug828737(self):
11241124
text = 'abc<def>ghi'*n
11251125
text.translate(charmap)
11261126

1127-
def test_mutatingdecodehandler(self):
1127+
def test_mutating_decode_handler(self):
11281128
baddata = [
11291129
("ascii", b"\xff"),
11301130
("utf-7", b"++"),
@@ -1159,6 +1159,40 @@ def mutating(exc):
11591159
for (encoding, data) in baddata:
11601160
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
11611161

1162+
def test_mutating_decode_handler_unicode_escape(self):
1163+
decode = codecs.unicode_escape_decode
1164+
def mutating(exc):
1165+
if isinstance(exc, UnicodeDecodeError):
1166+
r = data.get(exc.object[:exc.end])
1167+
if r is not None:
1168+
exc.object = r[0] + exc.object[exc.end:]
1169+
return ('\u0404', r[1])
1170+
raise AssertionError("don't know how to handle %r" % exc)
1171+
1172+
codecs.register_error('test.mutating2', mutating)
1173+
data = {
1174+
br'\x0': (b'\\', 0),
1175+
br'\x3': (b'xxx\\', 3),
1176+
br'\x5': (b'x\\', 1),
1177+
}
1178+
def check(input, expected, msg):
1179+
with self.assertWarns(DeprecationWarning) as cm:
1180+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1181+
self.assertIn(msg, str(cm.warning))
1182+
1183+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1184+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1185+
1186+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1187+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1188+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1189+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1190+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1191+
1192+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1193+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1194+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1195+
11621196
# issue32583
11631197
def test_crashing_decode_handler(self):
11641198
# better generating one more character to fill the extra space slot

Lib/test/test_codecs.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,20 +1181,32 @@ def test_escape(self):
11811181
check(br"[\501]", b"[A]")
11821182
check(br"[\x41]", b"[A]")
11831183
check(br"[\x410]", b"[A0]")
1184+
1185+
def test_warnings(self):
1186+
decode = codecs.escape_decode
1187+
check = coding_checker(self, decode)
11841188
for i in range(97, 123):
11851189
b = bytes([i])
11861190
if b not in b'abfnrtvx':
1187-
with self.assertWarns(DeprecationWarning):
1191+
with self.assertWarnsRegex(DeprecationWarning,
1192+
r"invalid escape sequence '\\%c'" % i):
11881193
check(b"\\" + b, b"\\" + b)
1189-
with self.assertWarns(DeprecationWarning):
1194+
with self.assertWarnsRegex(DeprecationWarning,
1195+
r"invalid escape sequence '\\%c'" % (i-32)):
11901196
check(b"\\" + b.upper(), b"\\" + b.upper())
1191-
with self.assertWarns(DeprecationWarning):
1197+
with self.assertWarnsRegex(DeprecationWarning,
1198+
r"invalid escape sequence '\\8'"):
11921199
check(br"\8", b"\\8")
11931200
with self.assertWarns(DeprecationWarning):
11941201
check(br"\9", b"\\9")
1195-
with self.assertWarns(DeprecationWarning):
1202+
with self.assertWarnsRegex(DeprecationWarning,
1203+
r"invalid escape sequence '\\\xfa'") as cm:
11961204
check(b"\\\xfa", b"\\\xfa")
11971205

1206+
with self.assertWarnsRegex(DeprecationWarning,
1207+
r"invalid escape sequence '\\z'"):
1208+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1209+
11981210
def test_errors(self):
11991211
decode = codecs.escape_decode
12001212
self.assertRaises(ValueError, decode, br"\x")
@@ -2408,20 +2420,31 @@ def test_escape_decode(self):
24082420
check(br"[\x410]", "[A0]")
24092421
check(br"\u20ac", "\u20ac")
24102422
check(br"\U0001d120", "\U0001d120")
2423+
2424+
def test_decode_warnings(self):
2425+
decode = codecs.unicode_escape_decode
2426+
check = coding_checker(self, decode)
24112427
for i in range(97, 123):
24122428
b = bytes([i])
24132429
if b not in b'abfnrtuvx':
2414-
with self.assertWarns(DeprecationWarning):
2430+
with self.assertWarnsRegex(DeprecationWarning,
2431+
r"invalid escape sequence '\\%c'" % i):
24152432
check(b"\\" + b, "\\" + chr(i))
24162433
if b.upper() not in b'UN':
2417-
with self.assertWarns(DeprecationWarning):
2434+
with self.assertWarnsRegex(DeprecationWarning,
2435+
r"invalid escape sequence '\\%c'" % (i-32)):
24182436
check(b"\\" + b.upper(), "\\" + chr(i-32))
2419-
with self.assertWarns(DeprecationWarning):
2437+
with self.assertWarnsRegex(DeprecationWarning,
2438+
r"invalid escape sequence '\\8'"):
24202439
check(br"\8", "\\8")
24212440
with self.assertWarns(DeprecationWarning):
24222441
check(br"\9", "\\9")
2423-
with self.assertWarns(DeprecationWarning):
2442+
with self.assertWarnsRegex(DeprecationWarning,
2443+
r"invalid escape sequence '\\\xfa'") as cm:
24242444
check(b"\\\xfa", "\\\xfa")
2445+
with self.assertWarnsRegex(DeprecationWarning,
2446+
r"invalid escape sequence '\\z'"):
2447+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
24252448

24262449
def test_decode_errors(self):
24272450
decode = codecs.unicode_escape_decode
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

Objects/bytesobject.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,10 +1089,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
10891089
}
10901090

10911091
/* Unescape a backslash-escaped string. */
1092-
PyObject *_PyBytes_DecodeEscape(const char *s,
1092+
PyObject *_PyBytes_DecodeEscape2(const char *s,
10931093
Py_ssize_t len,
10941094
const char *errors,
1095-
const char **first_invalid_escape)
1095+
int *first_invalid_escape_char,
1096+
const char **first_invalid_escape_ptr)
10961097
{
10971098
int c;
10981099
char *p;
@@ -1106,7 +1107,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11061107
return NULL;
11071108
writer.overallocate = 1;
11081109

1109-
*first_invalid_escape = NULL;
1110+
*first_invalid_escape_char = -1;
1111+
*first_invalid_escape_ptr = NULL;
11101112

11111113
end = s + len;
11121114
while (s < end) {
@@ -1181,9 +1183,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11811183
break;
11821184

11831185
default:
1184-
if (*first_invalid_escape == NULL) {
1185-
*first_invalid_escape = s-1; /* Back up one char, since we've
1186-
already incremented s. */
1186+
if (*first_invalid_escape_char == -1) {
1187+
*first_invalid_escape_char = (unsigned char)s[-1];
1188+
/* Back up one char, since we've already incremented s. */
1189+
*first_invalid_escape_ptr = s - 1;
11871190
}
11881191
*p++ = '\\';
11891192
s--;
@@ -1197,21 +1200,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11971200
return NULL;
11981201
}
11991202

1203+
// Export for binary compatibility.
1204+
PyObject *_PyBytes_DecodeEscape(const char *s,
1205+
Py_ssize_t len,
1206+
const char *errors,
1207+
const char **first_invalid_escape)
1208+
{
1209+
int first_invalid_escape_char;
1210+
return _PyBytes_DecodeEscape2(
1211+
s, len, errors,
1212+
&first_invalid_escape_char,
1213+
first_invalid_escape);
1214+
}
1215+
12001216
PyObject *PyBytes_DecodeEscape(const char *s,
12011217
Py_ssize_t len,
12021218
const char *errors,
12031219
Py_ssize_t Py_UNUSED(unicode),
12041220
const char *Py_UNUSED(recode_encoding))
12051221
{
1206-
const char* first_invalid_escape;
1207-
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
1208-
&first_invalid_escape);
1222+
int first_invalid_escape_char;
1223+
const char *first_invalid_escape_ptr;
1224+
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
1225+
&first_invalid_escape_char,
1226+
&first_invalid_escape_ptr);
12091227
if (result == NULL)
12101228
return NULL;
1211-
if (first_invalid_escape != NULL) {
1229+
if (first_invalid_escape_char != -1) {
12121230
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
12131231
"invalid escape sequence '\\%c'",
1214-
(unsigned char)*first_invalid_escape) < 0) {
1232+
first_invalid_escape_char) < 0) {
12151233
Py_DECREF(result);
12161234
return NULL;
12171235
}

Objects/unicodeobject.c

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6432,20 +6432,23 @@ PyUnicode_AsUTF16String(PyObject *unicode)
64326432
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
64336433

64346434
PyObject *
6435-
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6435+
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
64366436
Py_ssize_t size,
64376437
const char *errors,
64386438
Py_ssize_t *consumed,
6439-
const char **first_invalid_escape)
6439+
int *first_invalid_escape_char,
6440+
const char **first_invalid_escape_ptr)
64406441
{
64416442
const char *starts = s;
6443+
const char *initial_starts = starts;
64426444
_PyUnicodeWriter writer;
64436445
const char *end;
64446446
PyObject *errorHandler = NULL;
64456447
PyObject *exc = NULL;
64466448

64476449
// so we can remember if we've seen an invalid escape char or not
6448-
*first_invalid_escape = NULL;
6450+
*first_invalid_escape_char = -1;
6451+
*first_invalid_escape_ptr = NULL;
64496452

64506453
if (size == 0) {
64516454
if (consumed) {
@@ -6628,9 +6631,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
66286631
goto error;
66296632

66306633
default:
6631-
if (*first_invalid_escape == NULL) {
6632-
*first_invalid_escape = s-1; /* Back up one char, since we've
6633-
already incremented s. */
6634+
if (*first_invalid_escape_char == -1) {
6635+
*first_invalid_escape_char = c;
6636+
if (starts == initial_starts) {
6637+
/* Back up one char, since we've already incremented s. */
6638+
*first_invalid_escape_ptr = s - 1;
6639+
}
66346640
}
66356641
WRITE_ASCII_CHAR('\\');
66366642
WRITE_CHAR(c);
@@ -6669,22 +6675,39 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
66696675
return NULL;
66706676
}
66716677

6678+
// Export for binary compatibility.
6679+
PyObject *
6680+
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6681+
Py_ssize_t size,
6682+
const char *errors,
6683+
Py_ssize_t *consumed,
6684+
const char **first_invalid_escape)
6685+
{
6686+
int first_invalid_escape_char;
6687+
return _PyUnicode_DecodeUnicodeEscapeInternal2(
6688+
s, size, errors, consumed,
6689+
&first_invalid_escape_char,
6690+
first_invalid_escape);
6691+
}
6692+
66726693
PyObject *
66736694
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
66746695
Py_ssize_t size,
66756696
const char *errors,
66766697
Py_ssize_t *consumed)
66776698
{
6678-
const char *first_invalid_escape;
6679-
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6699+
int first_invalid_escape_char;
6700+
const char *first_invalid_escape_ptr;
6701+
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
66806702
consumed,
6681-
&first_invalid_escape);
6703+
&first_invalid_escape_char,
6704+
&first_invalid_escape_ptr);
66826705
if (result == NULL)
66836706
return NULL;
6684-
if (first_invalid_escape != NULL) {
6707+
if (first_invalid_escape_char != -1) {
66856708
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
66866709
"invalid escape sequence '\\%c'",
6687-
(unsigned char)*first_invalid_escape) < 0) {
6710+
first_invalid_escape_char) < 0) {
66886711
Py_DECREF(result);
66896712
return NULL;
66906713
}

Parser/string_parser.c

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,15 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
114114
len = p - buf;
115115
s = buf;
116116

117-
const char *first_invalid_escape;
118-
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119-
120-
if (v != NULL && first_invalid_escape != NULL) {
121-
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122-
/* We have not decref u before because first_invalid_escape points
117+
int first_invalid_escape_char;
118+
const char *first_invalid_escape_ptr;
119+
v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
120+
&first_invalid_escape_char,
121+
&first_invalid_escape_ptr);
122+
123+
if (v != NULL && first_invalid_escape_ptr != NULL) {
124+
if (warn_invalid_escape_sequence(parser, *first_invalid_escape_ptr, t) < 0) {
125+
/* We have not decref u before because first_invalid_escape_ptr points
123126
inside u. */
124127
Py_XDECREF(u);
125128
Py_DECREF(v);
@@ -133,14 +136,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
133136
static PyObject *
134137
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135138
{
136-
const char *first_invalid_escape;
137-
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
139+
int first_invalid_escape_char;
140+
const char *first_invalid_escape_ptr;
141+
PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
142+
&first_invalid_escape_char,
143+
&first_invalid_escape_ptr);
138144
if (result == NULL) {
139145
return NULL;
140146
}
141147

142-
if (first_invalid_escape != NULL) {
143-
if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
148+
if (first_invalid_escape_ptr != NULL) {
149+
if (warn_invalid_escape_sequence(p, *first_invalid_escape_ptr, t) < 0) {
144150
Py_DECREF(result);
145151
return NULL;
146152
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy