@@ -169,6 +169,7 @@ T floor_div(T n, T d) {
169
169
}
170
170
return r;
171
171
}
172
+
172
173
// General divide implementation for arrays
173
174
template <typename T>
174
175
void simd_divide_contig_signed (T* src1, T* src2, T* dst, npy_intp len) {
@@ -178,41 +179,52 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
178
179
bool raise_overflow = false ;
179
180
bool raise_divbyzero = false ;
180
181
const auto vec_zero = hn::Zero (d);
182
+ const auto vec_one = hn::Set (d, static_cast <T>(1 ));
181
183
const auto vec_min_val = hn::Set (d, std::numeric_limits<T>::min ());
182
184
const auto vec_neg_one = hn::Set (d, static_cast <T>(-1 ));
183
-
185
+
184
186
size_t i = 0 ;
185
187
for (; i + N <= static_cast <size_t >(len); i += N) {
186
188
const auto vec_a = hn::LoadU (d, src1 + i);
187
189
const auto vec_b = hn::LoadU (d, src2 + i);
190
+
188
191
const auto b_is_zero = hn::Eq (vec_b, vec_zero);
189
192
const auto a_is_min = hn::Eq (vec_a, vec_min_val);
190
193
const auto b_is_neg_one = hn::Eq (vec_b, vec_neg_one);
191
194
const auto overflow_cond = hn::And (a_is_min, b_is_neg_one);
192
- auto vec_div = hn::Div (vec_a, vec_b);
193
- const auto vec_mul = hn::Mul (vec_div, vec_b);
195
+
196
+ const auto safe_div_mask = hn::Not (hn::Or (b_is_zero, overflow_cond));
197
+ const auto safe_b = hn::IfThenElse (hn::Or (b_is_zero, overflow_cond), vec_one, vec_b);
198
+
199
+ auto vec_div = hn::Div (vec_a, safe_b);
200
+
201
+ const auto vec_mul = hn::Mul (vec_div, safe_b);
194
202
const auto has_remainder = hn::Ne (vec_a, vec_mul);
195
203
const auto a_sign = hn::Lt (vec_a, vec_zero);
196
- const auto b_sign = hn::Lt (vec_b , vec_zero);
204
+ const auto b_sign = hn::Lt (safe_b , vec_zero);
197
205
const auto different_signs = hn::Xor (a_sign, b_sign);
198
- auto adjustment = hn::And (different_signs, has_remainder);
199
- vec_div = hn::IfThenElse (adjustment,
200
- hn::Sub (vec_div, hn::Set (d, static_cast <T>(1 ))),
201
- vec_div);
206
+ const auto needs_adjustment = hn::And (safe_div_mask,
207
+ hn::And (different_signs, has_remainder));
208
+
209
+ vec_div = hn::MaskedSubOr (vec_div, needs_adjustment, vec_div, vec_one);
210
+
202
211
vec_div = hn::IfThenElse (b_is_zero, vec_zero, vec_div);
203
212
vec_div = hn::IfThenElse (overflow_cond, vec_min_val, vec_div);
213
+
204
214
hn::StoreU (vec_div, d, dst + i);
215
+
205
216
if (!raise_divbyzero && !hn::AllFalse (d, b_is_zero)) {
206
217
raise_divbyzero = true ;
207
218
}
208
219
if (!raise_overflow && !hn::AllFalse (d, overflow_cond)) {
209
220
raise_overflow = true ;
210
221
}
211
222
}
223
+
212
224
for (; i < static_cast <size_t >(len); i++) {
213
225
T a = src1[i];
214
226
T b = src2[i];
215
-
227
+
216
228
if (b == 0 ) {
217
229
dst[i] = 0 ;
218
230
raise_divbyzero = true ;
@@ -225,47 +237,51 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
225
237
dst[i] = floor_div (a, b);
226
238
}
227
239
}
228
-
240
+
229
241
set_float_status (raise_overflow, raise_divbyzero);
230
242
}
243
+
231
244
// Unsigned division for arrays
232
245
template <typename T>
233
246
void simd_divide_contig_unsigned (T* src1, T* src2, T* dst, npy_intp len) {
234
247
using D = hn::ScalableTag<T>;
235
248
const D d;
236
249
const size_t N = hn::Lanes (d);
237
-
238
250
bool raise_divbyzero = false ;
239
251
const auto vec_zero = hn::Zero (d);
240
-
252
+ const auto vec_one = hn::Set (d, static_cast <T>(1 ));
253
+
241
254
size_t i = 0 ;
242
255
for (; i + N <= static_cast <size_t >(len); i += N) {
243
256
const auto vec_a = hn::LoadU (d, src1 + i);
244
257
const auto vec_b = hn::LoadU (d, src2 + i);
258
+
245
259
const auto b_is_zero = hn::Eq (vec_b, vec_zero);
246
- auto vec_div = hn::Div (vec_a, vec_b);
260
+
261
+ const auto safe_b = hn::IfThenElse (b_is_zero, vec_one, vec_b);
262
+ auto vec_div = hn::Div (vec_a, safe_b);
247
263
vec_div = hn::IfThenElse (b_is_zero, vec_zero, vec_div);
248
264
hn::StoreU (vec_div, d, dst + i);
249
265
if (!raise_divbyzero && !hn::AllFalse (d, b_is_zero)) {
250
266
raise_divbyzero = true ;
251
267
}
252
268
}
269
+
270
+ // Handle remaining elements
253
271
for (; i < static_cast <size_t >(len); i++) {
254
272
T a = src1[i];
255
273
T b = src2[i];
256
-
274
+
257
275
if (b == 0 ) {
258
276
dst[i] = 0 ;
259
277
raise_divbyzero = true ;
260
278
} else {
261
279
dst[i] = a / b;
262
280
}
263
281
}
264
-
265
282
set_float_status (false , raise_divbyzero);
266
283
}
267
284
268
-
269
285
// Dispatch functions for signed integer division
270
286
template <typename T>
271
287
void TYPE_divide (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func)) {
0 commit comments