Skip to content

Commit e834998

Browse files
add array-array div logic
1 parent 47b98c1 commit e834998

File tree

1 file changed

+128
-3
lines changed

1 file changed

+128
-3
lines changed

numpy/_core/src/umath/loops_arithmetic.dispatch.cpp

Lines changed: 128 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,10 +169,107 @@ T floor_div(T n, T d) {
169169
}
170170
return r;
171171
}
172+
// General divide implementation for arrays
173+
template <typename T>
174+
void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
175+
using D = hn::ScalableTag<T>;
176+
const D d;
177+
const size_t N = hn::Lanes(d);
178+
bool raise_overflow = false;
179+
bool raise_divbyzero = false;
180+
const auto vec_zero = hn::Zero(d);
181+
const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
182+
const auto vec_neg_one = hn::Set(d, static_cast<T>(-1));
183+
184+
size_t i = 0;
185+
for (; i + N <= static_cast<size_t>(len); i += N) {
186+
const auto vec_a = hn::LoadU(d, src1 + i);
187+
const auto vec_b = hn::LoadU(d, src2 + i);
188+
const auto b_is_zero = hn::Eq(vec_b, vec_zero);
189+
const auto a_is_min = hn::Eq(vec_a, vec_min_val);
190+
const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one);
191+
const auto overflow_cond = hn::And(a_is_min, b_is_neg_one);
192+
auto vec_div = hn::Div(vec_a, vec_b);
193+
const auto vec_mul = hn::Mul(vec_div, vec_b);
194+
const auto has_remainder = hn::Ne(vec_a, vec_mul);
195+
const auto a_sign = hn::Lt(vec_a, vec_zero);
196+
const auto b_sign = hn::Lt(vec_b, vec_zero);
197+
const auto different_signs = hn::Xor(a_sign, b_sign);
198+
auto adjustment = hn::And(different_signs, has_remainder);
199+
vec_div = hn::IfThenElse(adjustment,
200+
hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))),
201+
vec_div);
202+
vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div);
203+
vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div);
204+
hn::StoreU(vec_div, d, dst + i);
205+
if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) {
206+
raise_divbyzero = true;
207+
}
208+
if (!raise_overflow && !hn::AllFalse(d, overflow_cond)) {
209+
raise_overflow = true;
210+
}
211+
}
212+
for (; i < static_cast<size_t>(len); i++) {
213+
T a = src1[i];
214+
T b = src2[i];
215+
216+
if (b == 0) {
217+
dst[i] = 0;
218+
raise_divbyzero = true;
219+
}
220+
else if (a == std::numeric_limits<T>::min() && b == -1) {
221+
dst[i] = std::numeric_limits<T>::min();
222+
raise_overflow = true;
223+
}
224+
else {
225+
dst[i] = floor_div(a, b);
226+
}
227+
}
228+
229+
set_float_status(raise_overflow, raise_divbyzero);
230+
}
231+
// Unsigned division for arrays
232+
template <typename T>
233+
void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) {
234+
using D = hn::ScalableTag<T>;
235+
const D d;
236+
const size_t N = hn::Lanes(d);
237+
238+
bool raise_divbyzero = false;
239+
const auto vec_zero = hn::Zero(d);
240+
241+
size_t i = 0;
242+
for (; i + N <= static_cast<size_t>(len); i += N) {
243+
const auto vec_a = hn::LoadU(d, src1 + i);
244+
const auto vec_b = hn::LoadU(d, src2 + i);
245+
const auto b_is_zero = hn::Eq(vec_b, vec_zero);
246+
auto vec_div = hn::Div(vec_a, vec_b);
247+
vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div);
248+
hn::StoreU(vec_div, d, dst + i);
249+
if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) {
250+
raise_divbyzero = true;
251+
}
252+
}
253+
for (; i < static_cast<size_t>(len); i++) {
254+
T a = src1[i];
255+
T b = src2[i];
256+
257+
if (b == 0) {
258+
dst[i] = 0;
259+
raise_divbyzero = true;
260+
} else {
261+
dst[i] = a / b;
262+
}
263+
}
264+
265+
set_float_status(false, raise_divbyzero);
266+
}
267+
172268

173269
// Dispatch functions for signed integer division
174270
template <typename T>
175271
void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
272+
npy_clear_floatstatus();
176273
if (IS_BINARY_REDUCE) {
177274
BINARY_REDUCE_LOOP(T) {
178275
const T divisor = *reinterpret_cast<T*>(ip2);
@@ -189,8 +286,22 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
189286
*reinterpret_cast<T*>(iop1) = io1;
190287
return;
191288
}
192-
#if NPY_SIMD
193-
if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
289+
#if NPY_SIMD
290+
// Handle array-array case
291+
if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH))
292+
{
293+
bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) &&
294+
nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]);
295+
// Check if we can use SIMD for contiguous arrays - all steps must equal sizeof(T)
296+
if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) {
297+
T* src1 = (T*)args[0];
298+
T* src2 = (T*)args[1];
299+
T* dst = (T*)args[2];
300+
simd_divide_contig_signed(src1, src2, dst, dimensions[0]);
301+
return;
302+
}
303+
}
304+
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
194305
*reinterpret_cast<T*>(args[1]) != 0)
195306
{
196307
bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]);
@@ -225,6 +336,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
225336
// Dispatch functions for unsigned integer division
226337
template <typename T>
227338
void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
339+
npy_clear_floatstatus();
228340
if (IS_BINARY_REDUCE) {
229341
BINARY_REDUCE_LOOP(T) {
230342
const T d = *reinterpret_cast<T*>(ip2);
@@ -239,7 +351,20 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
239351
return;
240352
}
241353
#if NPY_SIMD
242-
if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
354+
// Handle array-array case
355+
if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) {
356+
bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) &&
357+
nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]);
358+
// Check if we can use SIMD for contiguous arrays - all steps must equal sizeof(T)
359+
if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) {
360+
T* src1 = (T*)args[0];
361+
T* src2 = (T*)args[1];
362+
T* dst = (T*)args[2];
363+
simd_divide_contig_unsigned(src1, src2, dst, dimensions[0]);
364+
return;
365+
}
366+
}
367+
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
243368
*reinterpret_cast<T*>(args[1]) != 0)
244369
{
245370
bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy