Skip to content

BUG: add bounds-checking to in-place string multiply #29060

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 27, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/release/upcoming_changes/29060.change.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
* Multiplication between a string and an integer now raises OverflowError instead
  of MemoryError if the result of the multiplication would create a string that
  is too large to be represented. This follows Python's behavior.
12 changes: 12 additions & 0 deletions numpy/_core/src/umath/string_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,18 @@ struct Buffer {
return num_codepoints;
}

/*
 * Width of this buffer measured in the encoding's storage units:
 * the distance from `buf` to `after` for ASCII and UTF8, and that
 * distance divided by sizeof(npy_ucs4) for UTF32.
 */
inline size_t
buffer_width()
{
    switch (enc) {
        case ENCODING::ASCII:
        case ENCODING::UTF8:
            return after - buf;
        case ENCODING::UTF32:
            return (after - buf) / sizeof(npy_ucs4);
    }
    /*
     * Not reached for the encodings handled above. Without this return,
     * control could flow off the end of a non-void function (undefined
     * behavior, and a -Wreturn-type warning). A width of zero is the
     * conservative answer: callers that bound writes by the width will
     * write nothing.
     */
    return 0;
}

inline Buffer<enc>&
operator+=(npy_int64 rhs)
{
Expand Down
48 changes: 36 additions & 12 deletions numpy/_core/src/umath/string_ufuncs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "dtypemeta.h"
#include "convert_datatype.h"
#include "gil_utils.h"
#include "templ_common.h" /* for npy_mul_with_overflow_size_t */

#include "string_ufuncs.h"
#include "string_fastsearch.h"
Expand Down Expand Up @@ -166,26 +167,44 @@ string_add(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> out)


template <ENCODING enc>
static inline void
static inline int
string_multiply(Buffer<enc> buf1, npy_int64 reps, Buffer<enc> out)
{
size_t len1 = buf1.num_codepoints();
if (reps < 1 || len1 == 0) {
out.buffer_fill_with_zeros_after_index(0);
return;
return 0;
}

size_t width = out.buffer_width();
if (len1 == 1) {
out.buffer_memset(*buf1, reps);
out.buffer_fill_with_zeros_after_index(reps);
return 0;
}
else {
for (npy_int64 i = 0; i < reps; i++) {
buf1.buffer_memcpy(out, len1);
out += len1;
}
out.buffer_fill_with_zeros_after_index(0);

size_t newlen;
if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) < 0) || newlen > PY_SSIZE_T_MAX) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) < 0) || newlen > PY_SSIZE_T_MAX) {
if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) != 0) || newlen > PY_SSIZE_T_MAX) {

I think this one isn't technically an error return, but a "has overflown" return... Maybe the test should run into the overflow path, but newlen > PY_SSIZE_T_MAX also kicks in due to fun overflows?

Overflow error or memory error is both fine to me (for this branch I would also be OK with just using the width < newlen path).

return -1;
}

size_t pad = 0;
if (width < newlen) {
reps = width / len1;
pad = width % len1;
}

for (npy_int64 i = 0; i < reps; i++) {
buf1.buffer_memcpy(out, len1);
out += len1;
}

buf1.buffer_memcpy(out, pad);
out += pad;

out.buffer_fill_with_zeros_after_index(0);

return 0;
}


Expand Down Expand Up @@ -238,7 +257,9 @@ string_multiply_strint_loop(PyArrayMethod_Context *context,
while (N--) {
Buffer<enc> buf(in1, elsize);
Buffer<enc> outbuf(out, outsize);
string_multiply<enc>(buf, *(npy_int64 *)in2, outbuf);
if (NPY_UNLIKELY(string_multiply<enc>(buf, *(npy_int64 *)in2, outbuf) < 0)) {
npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply");
}

in1 += strides[0];
in2 += strides[1];
Expand Down Expand Up @@ -267,7 +288,9 @@ string_multiply_intstr_loop(PyArrayMethod_Context *context,
while (N--) {
Buffer<enc> buf(in2, elsize);
Buffer<enc> outbuf(out, outsize);
string_multiply<enc>(buf, *(npy_int64 *)in1, outbuf);
if (NPY_UNLIKELY(string_multiply<enc>(buf, *(npy_int64 *)in1, outbuf) < 0)) {
npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply");
}

in1 += strides[0];
in2 += strides[1];
Expand Down Expand Up @@ -752,10 +775,11 @@ string_multiply_resolve_descriptors(
if (given_descrs[2] == NULL) {
PyErr_SetString(
PyExc_TypeError,
"The 'out' kwarg is necessary. Use numpy.strings.multiply without it.");
"The 'out' kwarg is necessary when using the string multiply ufunc "
"directly. Use numpy.strings.multiply to multiply strings without "
"specifying 'out'.");
return _NPY_ERROR_OCCURRED_IN_CAST;
}

loop_descrs[0] = NPY_DT_CALL_ensure_canonical(given_descrs[0]);
if (loop_descrs[0] == NULL) {
return _NPY_ERROR_OCCURRED_IN_CAST;
Expand Down
6 changes: 3 additions & 3 deletions numpy/_core/src/umath/stringdtype_ufuncs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,9 @@ static int multiply_loop_core(
size_t newsize;
int overflowed = npy_mul_with_overflow_size_t(
&newsize, cursize, factor);
if (overflowed) {
npy_gil_error(PyExc_MemoryError,
"Failed to allocate string in string multiply");
if (overflowed || newsize > PY_SSIZE_T_MAX) {
npy_gil_error(PyExc_OverflowError,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, how are these things handled in free threaded Python?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think free-threading changes anything in this code path

Copy link
Member Author

@ngoldbaum ngoldbaum May 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To say another way: we could probably rename npy_gil_error to e.g. npy_error_py_attached to make it clearer that you still need to explicitly attach and detach from the runtime in all builds.

"Overflow encountered in string multiply");
goto fail;
}

Expand Down
2 changes: 1 addition & 1 deletion numpy/_core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def multiply(a, i):

# Ensure we can do a_len * i without overflow.
if np.any(a_len > sys.maxsize / np.maximum(i, 1)):
raise MemoryError("repeated string is too long")
raise OverflowError("Overflow encountered in string multiply")

buffersizes = a_len * i
out_dtype = f"{a.dtype.char}{buffersizes.max()}"
Expand Down
13 changes: 12 additions & 1 deletion numpy/_core/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,20 @@ def test_multiply_raises(self, dt):
with pytest.raises(TypeError, match="unsupported type"):
np.strings.multiply(np.array("abc", dtype=dt), 3.14)

with pytest.raises(MemoryError):
with pytest.raises(OverflowError):
np.strings.multiply(np.array("abc", dtype=dt), sys.maxsize)

def test_inplace_multiply(self, dt):
    """Check ``*=`` on string arrays for a small and an oversized factor.

    Doubling in place must give the expected contents for the dtype under
    test, and multiplying by ``sys.maxsize`` must raise ``OverflowError``.
    """
    strings = np.array(['foo ', 'bar'], dtype=dt)
    strings *= 2

    if dt == "T":
        # For "T" the full repeated strings are expected.
        assert_array_equal(strings, ['foo foo ', 'barbar'])
    else:
        # For the other dtypes the expected values are still four
        # characters long: the repeated text is cut off at that length.
        assert_array_equal(strings, np.array(['foo ', 'barb'], dtype=dt))

    with pytest.raises(OverflowError):
        strings *= sys.maxsize

@pytest.mark.parametrize("i_dt", [np.int8, np.int16, np.int32,
np.int64, np.int_])
def test_multiply_integer_dtypes(self, i_dt, dt):
Expand Down
3 changes: 2 additions & 1 deletion numpy/typing/tests/data/pass/ma.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
MAR_M_dt64: MaskedArray[np.datetime64] = np.ma.MaskedArray([np.datetime64(1, "D")])
MAR_S: MaskedArray[np.bytes_] = np.ma.MaskedArray([b'foo'], dtype=np.bytes_)
MAR_U: MaskedArray[np.str_] = np.ma.MaskedArray(['foo'], dtype=np.str_)
MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType], np.ma.MaskedArray(["a"], "T"))
MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType],
np.ma.MaskedArray(["a"], dtype="T"))

AR_b: npt.NDArray[np.bool] = np.array([True, False, True])

Expand Down
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy