BUG: add bounds-checking to in-place string multiply (#29060)

* BUG: add bounds-checking to in-place string multiply
* MNT: check for overflow and raise OverflowError
* MNT: respond to review suggestion
* MNT: handle overflow in one more spot
* MNT: make test behave the same on all architectures
* MNT: reorder to avoid work in some cases
numpy · charris · May 27, 2025 · May 27, 2025 · May 27, 2025 · ef57a14428ae4fe4693479701e09c2bb11882bfc
commit ef57a14428ae4fe4693479701e09c2bb11882bfc
diff --git a/doc/release/upcoming_changes/29060.change.rst b/doc/release/upcoming_changes/29060.change.rst
@@ -0,0 +1,3 @@
+* Multiplication between a string and an integer now raises ``OverflowError``
+  instead of ``MemoryError`` if the result would be a string that is too large
+  to be represented. This follows Python's behavior.
diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h
@@ -297,6 +297,18 @@ struct Buffer {
         return num_codepoints;
     }
 
+    // Size of the [buf, after) span: returned as-is for ASCII/UTF8 and
+    // divided by sizeof(npy_ucs4) for UTF32. Every path returns a value.
+    inline size_t
+    buffer_width()
+    {
+        size_t span = (size_t)(after - buf);
+        if (enc == ENCODING::UTF32) {
+            return span / sizeof(npy_ucs4);
+        }
+        return span;
+    }
+
     inline Buffer<enc>&
     operator+=(npy_int64 rhs)
     {

diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp
@@ -15,6 +15,7 @@
 #include "dtypemeta.h"
 #include "convert_datatype.h"
 #include "gil_utils.h"
+#include "templ_common.h" /* for npy_mul_size_with_overflow_size_t */
 
 #include "string_ufuncs.h"
 #include "string_fastsearch.h"
@@ -166,26 +167,58 @@ string_add(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> out)
 
 
+// Write `buf1` repeated `reps` times into `out`, truncating the result to
+// the width of `out` and zero-filling whatever is left over. Returns 0 on
+// success and -1 when the repeat count or the untruncated length does not
+// fit in a Py_ssize_t; raising the Python error is left to the caller.
 template <ENCODING enc>
-static inline void
+static inline int
 string_multiply(Buffer<enc> buf1, npy_int64 reps, Buffer<enc> out)
 {
     size_t len1 = buf1.num_codepoints();
     if (reps < 1 || len1 == 0) {
         out.buffer_fill_with_zeros_after_index(0);
-        return;
+        return 0;
     }
 
-    if (len1 == 1) {
-        out.buffer_memset(*buf1, reps);
-        out.buffer_fill_with_zeros_after_index(reps);
-    }
-    else {
-        for (npy_int64 i = 0; i < reps; i++) {
-            buf1.buffer_memcpy(out, len1);
-            out += len1;
-        }
-        out.buffer_fill_with_zeros_after_index(0);
+    // reps >= 1 here. Reject counts that do not fit before narrowing to
+    // size_t, which is only 32 bits wide on 32-bit builds.
+    if (NPY_UNLIKELY((npy_uint64)reps > (npy_uint64)PY_SSIZE_T_MAX)) {
+        return -1;
+    }
+    size_t nreps = (size_t)reps;
+    size_t width = out.buffer_width();
+
+    if (len1 == 1) {
+        // Single-character fast path. Clamp the fill to the output width so
+        // an in-place multiply cannot write past the end of the element.
+        size_t nfill = nreps < width ? nreps : width;
+        out.buffer_memset(*buf1, nfill);
+        out.buffer_fill_with_zeros_after_index(nfill);
+        return 0;
+    }
+
+    size_t newlen;
+    if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, nreps, len1) != 0) || newlen > PY_SSIZE_T_MAX) {
+        return -1;
+    }
+
+    size_t pad = 0;
+    if (width < newlen) {
+        // Output is too small: keep the whole copies that fit, then a prefix.
+        nreps = width / len1;
+        pad = width % len1;
     }
+
+    for (size_t i = 0; i < nreps; i++) {
+        buf1.buffer_memcpy(out, len1);
+        out += len1;
+    }
+
+    buf1.buffer_memcpy(out, pad);
+    out += pad;
+
+    out.buffer_fill_with_zeros_after_index(0);
+    return 0;
 }
 
 
@@ -238,7 +257,9 @@ string_multiply_strint_loop(PyArrayMethod_Context *context,
     while (N--) {
         Buffer<enc> buf(in1, elsize);
         Buffer<enc> outbuf(out, outsize);
-        string_multiply<enc>(buf, *(npy_int64 *)in2, outbuf);
+        if (NPY_UNLIKELY(string_multiply<enc>(buf, *(npy_int64 *)in2, outbuf) < 0)) {
+            npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply");
+        }
 
         in1 += strides[0];
         in2 += strides[1];
@@ -267,7 +288,9 @@ string_multiply_intstr_loop(PyArrayMethod_Context *context,
     while (N--) {
         Buffer<enc> buf(in2, elsize);
         Buffer<enc> outbuf(out, outsize);
-        string_multiply<enc>(buf, *(npy_int64 *)in1, outbuf);
+        if (NPY_UNLIKELY(string_multiply<enc>(buf, *(npy_int64 *)in1, outbuf) < 0)) {
+            npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply");
+        }
 
         in1 += strides[0];
         in2 += strides[1];
@@ -752,10 +775,11 @@ string_multiply_resolve_descriptors(
     if (given_descrs[2] == NULL) {
         PyErr_SetString(
             PyExc_TypeError,
-            "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.");
+            "The 'out' kwarg is necessary when using the string multiply ufunc "
+            "directly. Use numpy.strings.multiply to multiply strings without "
+            "specifying 'out'.");
         return _NPY_ERROR_OCCURRED_IN_CAST;
     }
-
     loop_descrs[0] = NPY_DT_CALL_ensure_canonical(given_descrs[0]);
     if (loop_descrs[0] == NULL) {
         return _NPY_ERROR_OCCURRED_IN_CAST;

diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp
@@ -137,9 +137,9 @@ static int multiply_loop_core(
         size_t newsize;
         int overflowed = npy_mul_with_overflow_size_t(
                 &newsize, cursize, factor);
-        if (overflowed) {
-            npy_gil_error(PyExc_MemoryError,
-                      "Failed to allocate string in string multiply");
+        if (overflowed || newsize > PY_SSIZE_T_MAX) {
+            npy_gil_error(PyExc_OverflowError,
+                      "Overflow encountered in string multiply");
             goto fail;
         }
 
@@ -1748,9 +1748,9 @@ center_ljust_rjust_strided_loop(PyArrayMethod_Context *context,
                     width - num_codepoints);
             newsize += s1.size;
 
-            if (overflowed) {
-                npy_gil_error(PyExc_MemoryError,
-                              "Failed to allocate string in %s", ufunc_name);
+            if (overflowed || newsize > PY_SSIZE_T_MAX) {
+                npy_gil_error(PyExc_OverflowError,
+                              "Overflow encountered in %s", ufunc_name);
                 goto fail;
             }
 

diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py
@@ -218,7 +218,7 @@ def multiply(a, i):
 
     # Ensure we can do a_len * i without overflow.
     if np.any(a_len > sys.maxsize / np.maximum(i, 1)):
-        raise MemoryError("repeated string is too long")
+        raise OverflowError("Overflow encountered in string multiply")
 
     buffersizes = a_len * i
     out_dtype = f"{a.dtype.char}{buffersizes.max()}"

diff --git a/numpy/_core/tests/test_stringdtype.py b/numpy/_core/tests/test_stringdtype.py
@@ -128,8 +128,8 @@ def test_null_roundtripping():
 
 def test_string_too_large_error():
     arr = np.array(["a", "b", "c"], dtype=StringDType())
-    with pytest.raises(MemoryError):
-        arr * (2**63 - 2)
+    with pytest.raises(OverflowError):
+        arr * (sys.maxsize + 1)  # one more than the largest Py_ssize_t value
 
 
 @pytest.mark.parametrize(

diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py
@@ -224,9 +224,20 @@ def test_multiply_raises(self, dt):
         with pytest.raises(TypeError, match="unsupported type"):
             np.strings.multiply(np.array("abc", dtype=dt), 3.14)
 
-        with pytest.raises(MemoryError):
+        with pytest.raises(OverflowError):
             np.strings.multiply(np.array("abc", dtype=dt), sys.maxsize)
 
+    def test_inplace_multiply(self, dt):
+        # Non-"T" results are expected cut to the 4-char element; "T" keeps all.
+        strs = np.array(['foo ', 'bar'], dtype=dt)
+        strs *= 2
+        if dt == "T":
+            assert_array_equal(strs, ['foo foo ', 'barbar'])
+        else:
+            assert_array_equal(strs, np.array(['foo ', 'barb'], dtype=dt))
+        with pytest.raises(OverflowError):
+            strs *= sys.maxsize
+
     @pytest.mark.parametrize("i_dt", [np.int8, np.int16, np.int32,
                                       np.int64, np.int_])
     def test_multiply_integer_dtypes(self, i_dt, dt):

diff --git a/numpy/typing/tests/data/pass/ma.py b/numpy/typing/tests/data/pass/ma.py
@@ -16,7 +16,8 @@
 MAR_M_dt64: MaskedArray[np.datetime64] = np.ma.MaskedArray([np.datetime64(1, "D")])
 MAR_S: MaskedArray[np.bytes_] = np.ma.MaskedArray([b'foo'], dtype=np.bytes_)
 MAR_U: MaskedArray[np.str_] = np.ma.MaskedArray(['foo'], dtype=np.str_)
-MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType], np.ma.MaskedArray(["a"], "T"))
+MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType],
+             np.ma.MaskedArray(["a"], dtype="T"))
 
 AR_b: npt.NDArray[np.bool] = np.array([True, False, True])