numpy · ngoldbaum · Jun 20, 2025 · Apr 15, 2025 · Apr 16, 2025 · Apr 16, 2025
diff --git a/doc/release/upcoming_changes/28767.change.rst b/doc/release/upcoming_changes/28767.change.rst
@@ -0,0 +1,10 @@
+``unique_values`` for string dtypes may return unsorted data
+------------------------------------------------------------
+np.unique now supports hash‐based duplicate removal for string dtypes.
+This enhancement extends the hash-table algorithm to byte strings ('S'),
+Unicode strings ('U'), and the experimental string dtype ('T', StringDType).
+As a result, calling np.unique() on an array of strings will use
+the faster hash-based method to obtain unique values.
+Note that this hash-based method does not guarantee that the returned unique values will be sorted.
+This also works for StringDType arrays containing None (missing values)
+when using equal_nan=True (treating missing values as equal).
diff --git a/doc/release/upcoming_changes/28767.performance.rst b/doc/release/upcoming_changes/28767.performance.rst
@@ -0,0 +1,10 @@
+Performance improvements to ``np.unique`` for string dtypes
+-----------------------------------------------------------
+The hash-based algorithm for unique extraction provides
+an order-of-magnitude speedup on large string arrays. 
+In an internal benchmark with about 1 billion string elements,
+the hash-based np.unique completed in roughly 33.5 seconds,
+compared to 498 seconds with the sort-based method
+– about 15× faster for unsorted unique operations on strings.
+This improvement greatly reduces the time to find unique values
+in very large string datasets.
diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build
@@ -1206,6 +1206,7 @@ src_multiarray = multiarray_gen_headers + [
   # Remove this `arm64_exports.c` file once scipy macos arm64 build correctly
   # links to the arm64 npymath library, see gh-22673
   'src/npymath/arm64_exports.c',
+  'src/multiarray/fnv.c',
 ]
 
 src_umath = umath_gen_headers + [

diff --git a/numpy/_core/src/multiarray/fnv.c b/numpy/_core/src/multiarray/fnv.c
@@ -0,0 +1,85 @@
+/*
+  FNV-1a hash algorithm implementation
+  Based on the implementation from:
+  https://github.com/lcn2/fnv
+*/
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <Python.h>
+#include "numpy/npy_common.h"
+#include "fnv.h"
+
+
+#define FNV1A_32_INIT ((npy_uint32)0x811c9dc5)
+#define FNV1A_64_INIT ((npy_uint64)0xcbf29ce484222325ULL)
+
+/*
+  Compute a 32-bit FNV-1a hash of buffer
+  original implementation from:
+  https://github.com/lcn2/fnv/blob/b7fcbee95538ee6a15744e756e7e7f1c02862cb0/hash_32a.c
+*/
+npy_uint32 
+npy_fnv1a_32(const void *buf, size_t len, npy_uint32 hval)
+{
+    const unsigned char *bp = (const unsigned char *)buf;  /* start of buffer */
+    const unsigned char *be = bp + len;                    /* beyond end of buffer */
+
+    /*
+      FNV-1a hash each octet in the buffer
+    */
+    while (bp < be) {
+
+        /* xor the bottom with the current octet */
+        hval ^= (npy_uint32)*bp++;
+
+        /* multiply by the 32 bit FNV magic prime */
+        /* hval *= 0x01000193; */
+        hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
+    }
+
+    return hval;
+}
+
+/*
+  Compute a 64-bit FNV-1a hash of the given data
+  original implementation from:
+  https://github.com/lcn2/fnv/blob/b7fcbee95538ee6a15744e756e7e7f1c02862cb0/hash_64a.c
+*/
+npy_uint64 
+npy_fnv1a_64(const void *buf, size_t len, npy_uint64 hval)
+{
+    const unsigned char *bp = (const unsigned char *)buf;  /* start of buffer */
+    const unsigned char *be = bp + len;                    /* beyond end of buffer */
+
+    /*
+      FNV-1a hash each octet in the buffer
+    */
+    while (bp < be) {
+
+        /* xor the bottom with the current octet */
+        hval ^= (npy_uint64)*bp++;
+
+        /* multiply by the 64 bit FNV magic prime */
+        /* hval *= 0x100000001b3ULL; */
+        hval += (hval << 1) + (hval << 4) + (hval << 5) +
+		        (hval << 7) + (hval << 8) + (hval << 40);
+    }
+
+    return hval;
+}
+
+/*
+ * Compute a size_t FNV-1a hash of the given data
+ * This will use 32-bit or 64-bit hash depending on the size of size_t
+ */
+size_t 
+npy_fnv1a(const void *buf, size_t len)
+{
+#if NPY_SIZEOF_SIZE_T == 8
+    return (size_t)npy_fnv1a_64(buf, len, FNV1A_64_INIT);
+#else /* NPY_SIZEOF_SIZE_T == 4 */
+    return (size_t)npy_fnv1a_32(buf, len, FNV1A_32_INIT);
+#endif
+}
diff --git a/numpy/_core/src/multiarray/fnv.h b/numpy/_core/src/multiarray/fnv.h
@@ -0,0 +1,26 @@
+/*
+  FNV-1a hash algorithm implementation
+  Based on the implementation from:
+  https://github.com/lcn2/fnv
+*/
+
+#ifndef NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_
+
+
+/*
+  Compute a size_t FNV-1a hash of the given data
+  This will use 32-bit or 64-bit hash depending on the size of size_t
+
+  Parameters:
+  -----------
+  buf - pointer to the data to be hashed
+  len - length of the data in bytes
+
+  Returns:
+  -----------
+  size_t hash value
+*/
+size_t npy_fnv1a(const void *buf, size_t len);
+
+#endif  // NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_
diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c
@@ -4571,7 +4571,7 @@ static struct PyMethodDef array_module_methods[] = {
     {"from_dlpack", (PyCFunction)from_dlpack,
         METH_FASTCALL | METH_KEYWORDS, NULL},
     {"_unique_hash",  (PyCFunction)array__unique_hash,
-        METH_O, "Collect unique values via a hash map."},
+        METH_FASTCALL | METH_KEYWORDS, "Collect unique values via a hash map."},
     {NULL, NULL, 0, NULL}                /* sentinel */
 };