From db879703b7edaf5d1200d8bc3784fa1460edc7a3 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 26 May 2025 12:25:07 -0600 Subject: [PATCH 1/6] BUG: add bounds-checking to in-place string multiply --- numpy/_core/src/umath/string_buffer.h | 12 ++++++++++++ numpy/_core/src/umath/string_ufuncs.cpp | 13 +++++++++++-- numpy/_core/tests/test_strings.py | 8 ++++++++ numpy/typing/tests/data/pass/ma.py | 3 ++- 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h index 554f9ece5197..dafedcbc03ff 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -297,6 +297,18 @@ struct Buffer { return num_codepoints; } + inline size_t + buffer_width() + { + switch (enc) { + case ENCODING::ASCII: + case ENCODING::UTF8: + return after - buf; + case ENCODING::UTF32: + return (after - buf) / sizeof(npy_ucs4); + } + } + inline Buffer& operator+=(npy_int64 rhs) { diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index ffd757815364..7f7857c49c34 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -175,6 +175,12 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) return; } + size_t width = out.buffer_width(); + size_t pad = 0; + if (width < len1*reps) { + reps = width / len1; + pad = width % len1; + } if (len1 == 1) { out.buffer_memset(*buf1, reps); out.buffer_fill_with_zeros_after_index(reps); @@ -184,6 +190,8 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) buf1.buffer_memcpy(out, len1); out += len1; } + buf1.buffer_memcpy(out, pad); + out += pad; out.buffer_fill_with_zeros_after_index(0); } } @@ -752,10 +760,11 @@ string_multiply_resolve_descriptors( if (given_descrs[2] == NULL) { PyErr_SetString( PyExc_TypeError, - "The 'out' kwarg is necessary. Use numpy.strings.multiply without it."); + "The 'out' kwarg is necessary when using the string multiply ufunc " + "directly. Use numpy.strings.multiply to multiply strings without " + "specifying 'out'."); return _NPY_ERROR_OCCURRED_IN_CAST; } - loop_descrs[0] = NPY_DT_CALL_ensure_canonical(given_descrs[0]); if (loop_descrs[0] == NULL) { return _NPY_ERROR_OCCURRED_IN_CAST; diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index f6de208d7951..f87db9dcdd85 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -227,6 +227,14 @@ def test_multiply_raises(self, dt): with pytest.raises(MemoryError): np.strings.multiply(np.array("abc", dtype=dt), sys.maxsize) + def test_inplace_multiply(self, dt): + arr = np.array(['foo ', 'bar'], dtype=dt) + arr *= 2 + if dt != "T": + assert_array_equal(arr, np.array(['foo ', 'barb'], dtype=dt)) + else: + assert_array_equal(arr, ['foo foo ', 'barbar']) + @pytest.mark.parametrize("i_dt", [np.int8, np.int16, np.int32, np.int64, np.int_]) def test_multiply_integer_dtypes(self, i_dt, dt): diff --git a/numpy/typing/tests/data/pass/ma.py b/numpy/typing/tests/data/pass/ma.py index dc9474fe4069..e7915a583210 100644 --- a/numpy/typing/tests/data/pass/ma.py +++ b/numpy/typing/tests/data/pass/ma.py @@ -16,7 +16,8 @@ MAR_M_dt64: MaskedArray[np.datetime64] = np.ma.MaskedArray([np.datetime64(1, "D")]) MAR_S: MaskedArray[np.bytes_] = np.ma.MaskedArray([b'foo'], dtype=np.bytes_) MAR_U: MaskedArray[np.str_] = np.ma.MaskedArray(['foo'], dtype=np.str_) -MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType], np.ma.MaskedArray(["a"], "T")) +MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType], + np.ma.MaskedArray(["a"], dtype="T")) AR_b: npt.NDArray[np.bool] = np.array([True, False, True]) From 032c8cb3b68046f8b7654b01b3ae150d1676d184 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 10:05:01 -0600 Subject: [PATCH 2/6] MNT: check for overflow and raise OverflowError --- doc/release/upcoming_changes/29060.change.rst | 3 ++ numpy/_core/src/umath/string_ufuncs.cpp | 49 ++++++++++++------- numpy/_core/src/umath/stringdtype_ufuncs.cpp | 6 +-- numpy/_core/strings.py | 2 +- numpy/_core/tests/test_strings.py | 5 +- 5 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 doc/release/upcoming_changes/29060.change.rst diff --git a/doc/release/upcoming_changes/29060.change.rst b/doc/release/upcoming_changes/29060.change.rst new file mode 100644 index 000000000000..1561da7bf94e --- /dev/null +++ b/doc/release/upcoming_changes/29060.change.rst @@ -0,0 +1,3 @@ +* Multiplication between a string and integer now raises OverflowError instead + of MemoryError if the result of the multiplication would create a string that + is too large to be represented. This follows Python's behavior. diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 7f7857c49c34..502062095f2d 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -15,6 +15,7 @@ #include "dtypemeta.h" #include "convert_datatype.h" #include "gil_utils.h" +#include "templ_common.h" /* for npy_mul_size_with_overflow_size_t */ #include "string_ufuncs.h" #include "string_fastsearch.h" @@ -166,34 +167,44 @@ string_add(Buffer buf1, Buffer buf2, Buffer out) template -static inline void +static inline int string_multiply(Buffer buf1, npy_int64 reps, Buffer out) { size_t len1 = buf1.num_codepoints(); if (reps < 1 || len1 == 0) { out.buffer_fill_with_zeros_after_index(0); - return; + return 0; } size_t width = out.buffer_width(); - size_t pad = 0; - if (width < len1*reps) { - reps = width / len1; - pad = width % len1; - } if (len1 == 1) { out.buffer_memset(*buf1, reps); out.buffer_fill_with_zeros_after_index(reps); + return 0; } - else { - for (npy_int64 i = 0; i < reps; i++) { - buf1.buffer_memcpy(out, len1); - out += len1; - } - buf1.buffer_memcpy(out, pad); - out += pad; - out.buffer_fill_with_zeros_after_index(0); + + size_t newlen; + if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) < 0) || newlen > PY_SSIZE_T_MAX) { + return -1; } + + size_t pad = 0; + if (width < newlen) { + reps = width / len1; + pad = width % len1; + } + + for (npy_int64 i = 0; i < reps; i++) { + buf1.buffer_memcpy(out, len1); + out += len1; + } + + buf1.buffer_memcpy(out, pad); + out += pad; + + out.buffer_fill_with_zeros_after_index(0); + + return 0; } @@ -246,7 +257,9 @@ string_multiply_strint_loop(PyArrayMethod_Context *context, while (N--) { Buffer buf(in1, elsize); Buffer outbuf(out, outsize); - string_multiply(buf, *(npy_int64 *)in2, outbuf); + if (NPY_UNLIKELY(string_multiply(buf, *(npy_int64 *)in2, outbuf) < 0)) { + npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply"); + } in1 += strides[0]; in2 += strides[1]; @@ -275,7 +288,9 @@ string_multiply_intstr_loop(PyArrayMethod_Context *context, while (N--) { Buffer buf(in2, elsize); Buffer outbuf(out, outsize); - string_multiply(buf, *(npy_int64 *)in1, outbuf); + if (NPY_UNLIKELY(string_multiply(buf, *(npy_int64 *)in1, outbuf) < 0)) { + npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply"); + } in1 += strides[0]; in2 += strides[1]; diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index adcbfd3b7480..36418a489136 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -137,9 +137,9 @@ static int multiply_loop_core( size_t newsize; int overflowed = npy_mul_with_overflow_size_t( &newsize, cursize, factor); - if (overflowed) { - npy_gil_error(PyExc_MemoryError, - "Failed to allocate string in string multiply"); + if (overflowed || newsize > PY_SSIZE_T_MAX) { + npy_gil_error(PyExc_OverflowError, + "Overflow encountered in string multiply"); goto fail; } diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py index fc0a2d0b4d1a..b4dc1656024f 100644 --- a/numpy/_core/strings.py +++ b/numpy/_core/strings.py @@ -218,7 +218,7 @@ def multiply(a, i): # Ensure we can do a_len * i without overflow. if np.any(a_len > sys.maxsize / np.maximum(i, 1)): - raise MemoryError("repeated string is too long") + raise OverflowError("Overflow encountered in string multiply") buffersizes = a_len * i out_dtype = f"{a.dtype.char}{buffersizes.max()}" diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index f87db9dcdd85..56e928df4d7b 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -224,7 +224,7 @@ def test_multiply_raises(self, dt): with pytest.raises(TypeError, match="unsupported type"): np.strings.multiply(np.array("abc", dtype=dt), 3.14) - with pytest.raises(MemoryError): + with pytest.raises(OverflowError): np.strings.multiply(np.array("abc", dtype=dt), sys.maxsize) def test_inplace_multiply(self, dt): @@ -235,6 +235,9 @@ def test_inplace_multiply(self, dt): else: assert_array_equal(arr, ['foo foo ', 'barbar']) + with pytest.raises(OverflowError): + arr *= sys.maxsize + @pytest.mark.parametrize("i_dt", [np.int8, np.int16, np.int32, np.int64, np.int_]) def test_multiply_integer_dtypes(self, i_dt, dt): From 1903aa545d4570ba87cb79520f6117a8de6e29d1 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:00:04 -0600 Subject: [PATCH 3/6] MNT: respond to review suggestion --- numpy/_core/src/umath/string_ufuncs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 502062095f2d..8d71dadddca0 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -184,7 +184,7 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) } size_t newlen; - if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) < 0) || newlen > PY_SSIZE_T_MAX) { + if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) != 0) || newlen > PY_SSIZE_T_MAX) { return -1; } From 24643be593de91ab68a959048a30256bcee4b5a4 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:00:15 -0600 Subject: [PATCH 4/6] MNT: handle overflow in one more spot --- numpy/_core/src/umath/stringdtype_ufuncs.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index 36418a489136..b0181d4186c9 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -1748,9 +1748,9 @@ center_ljust_rjust_strided_loop(PyArrayMethod_Context *context, width - num_codepoints); newsize += s1.size; - if (overflowed) { - npy_gil_error(PyExc_MemoryError, - "Failed to allocate string in %s", ufunc_name); + if (overflowed || newsize > PY_SSIZE_T_MAX) { + npy_gil_error(PyExc_OverflowError, + "Overflow encountered in %s", ufunc_name); goto fail; } From 03fa9292198bbcf2286620bfce306d241c0e9799 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:00:29 -0600 Subject: [PATCH 5/6] MNT: make test behave the same on all architectures --- numpy/_core/tests/test_stringdtype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/tests/test_stringdtype.py b/numpy/_core/tests/test_stringdtype.py index 1c15c4895eaf..9bab810d4421 100644 --- a/numpy/_core/tests/test_stringdtype.py +++ b/numpy/_core/tests/test_stringdtype.py @@ -128,8 +128,8 @@ def test_null_roundtripping(): def test_string_too_large_error(): arr = np.array(["a", "b", "c"], dtype=StringDType()) - with pytest.raises(MemoryError): - arr * (2**63 - 2) + with pytest.raises(OverflowError): + arr * (sys.maxsize + 1) @pytest.mark.parametrize( From 3272c4834ce5a0fb586b5205d5e41b9db66b9bfd Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:15:54 -0600 Subject: [PATCH 6/6] MNT: reorder to avoid work in some cases --- numpy/_core/src/umath/string_ufuncs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 8d71dadddca0..95f30ccb109e 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -176,7 +176,6 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) return 0; } - size_t width = out.buffer_width(); if (len1 == 1) { out.buffer_memset(*buf1, reps); out.buffer_fill_with_zeros_after_index(reps); @@ -189,6 +188,7 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) } size_t pad = 0; + size_t width = out.buffer_width(); if (width < newlen) { reps = width / len1; pad = width % len1; pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy