Skip to content

Commit ae7e67e

Browse files
update logic for array-array div
1 parent: 561c69d · commit: ae7e67e

File tree

1 file changed

+32
-16
lines changed

1 file changed

+32
-16
lines changed

numpy/_core/src/umath/loops_arithmetic.dispatch.cpp

Lines changed: 32 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,7 @@ T floor_div(T n, T d) {
169169
}
170170
return r;
171171
}
172+
172173
// General divide implementation for arrays
173174
template <typename T>
174175
void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
@@ -178,41 +179,52 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
178179
bool raise_overflow = false;
179180
bool raise_divbyzero = false;
180181
const auto vec_zero = hn::Zero(d);
182+
const auto vec_one = hn::Set(d, static_cast<T>(1));
181183
const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
182184
const auto vec_neg_one = hn::Set(d, static_cast<T>(-1));
183-
185+
184186
size_t i = 0;
185187
for (; i + N <= static_cast<size_t>(len); i += N) {
186188
const auto vec_a = hn::LoadU(d, src1 + i);
187189
const auto vec_b = hn::LoadU(d, src2 + i);
190+
188191
const auto b_is_zero = hn::Eq(vec_b, vec_zero);
189192
const auto a_is_min = hn::Eq(vec_a, vec_min_val);
190193
const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one);
191194
const auto overflow_cond = hn::And(a_is_min, b_is_neg_one);
192-
auto vec_div = hn::Div(vec_a, vec_b);
193-
const auto vec_mul = hn::Mul(vec_div, vec_b);
195+
196+
const auto safe_div_mask = hn::Not(hn::Or(b_is_zero, overflow_cond));
197+
const auto safe_b = hn::IfThenElse(hn::Or(b_is_zero, overflow_cond), vec_one, vec_b);
198+
199+
auto vec_div = hn::Div(vec_a, safe_b);
200+
201+
const auto vec_mul = hn::Mul(vec_div, safe_b);
194202
const auto has_remainder = hn::Ne(vec_a, vec_mul);
195203
const auto a_sign = hn::Lt(vec_a, vec_zero);
196-
const auto b_sign = hn::Lt(vec_b, vec_zero);
204+
const auto b_sign = hn::Lt(safe_b, vec_zero);
197205
const auto different_signs = hn::Xor(a_sign, b_sign);
198-
auto adjustment = hn::And(different_signs, has_remainder);
199-
vec_div = hn::IfThenElse(adjustment,
200-
hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))),
201-
vec_div);
206+
const auto needs_adjustment = hn::And(safe_div_mask,
207+
hn::And(different_signs, has_remainder));
208+
209+
vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one);
210+
202211
vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div);
203212
vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div);
213+
204214
hn::StoreU(vec_div, d, dst + i);
215+
205216
if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) {
206217
raise_divbyzero = true;
207218
}
208219
if (!raise_overflow && !hn::AllFalse(d, overflow_cond)) {
209220
raise_overflow = true;
210221
}
211222
}
223+
212224
for (; i < static_cast<size_t>(len); i++) {
213225
T a = src1[i];
214226
T b = src2[i];
215-
227+
216228
if (b == 0) {
217229
dst[i] = 0;
218230
raise_divbyzero = true;
@@ -225,47 +237,51 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
225237
dst[i] = floor_div(a, b);
226238
}
227239
}
228-
240+
229241
set_float_status(raise_overflow, raise_divbyzero);
230242
}
243+
231244
// Unsigned division for arrays
232245
template <typename T>
233246
void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) {
234247
using D = hn::ScalableTag<T>;
235248
const D d;
236249
const size_t N = hn::Lanes(d);
237-
238250
bool raise_divbyzero = false;
239251
const auto vec_zero = hn::Zero(d);
240-
252+
const auto vec_one = hn::Set(d, static_cast<T>(1));
253+
241254
size_t i = 0;
242255
for (; i + N <= static_cast<size_t>(len); i += N) {
243256
const auto vec_a = hn::LoadU(d, src1 + i);
244257
const auto vec_b = hn::LoadU(d, src2 + i);
258+
245259
const auto b_is_zero = hn::Eq(vec_b, vec_zero);
246-
auto vec_div = hn::Div(vec_a, vec_b);
260+
261+
const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b);
262+
auto vec_div = hn::Div(vec_a, safe_b);
247263
vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div);
248264
hn::StoreU(vec_div, d, dst + i);
249265
if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) {
250266
raise_divbyzero = true;
251267
}
252268
}
269+
270+
// Handle remaining elements
253271
for (; i < static_cast<size_t>(len); i++) {
254272
T a = src1[i];
255273
T b = src2[i];
256-
274+
257275
if (b == 0) {
258276
dst[i] = 0;
259277
raise_divbyzero = true;
260278
} else {
261279
dst[i] = a / b;
262280
}
263281
}
264-
265282
set_float_status(false, raise_divbyzero);
266283
}
267284

268-
269285
// Dispatch functions for signed integer division
270286
template <typename T>
271287
void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad. © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy