@@ -169,10 +169,107 @@ T floor_div(T n, T d) {
169
169
}
170
170
return r;
171
171
}
172
+ // General divide implementation for arrays
173
+ template <typename T>
174
+ void simd_divide_contig_signed (T* src1, T* src2, T* dst, npy_intp len) {
175
+ using D = hn::ScalableTag<T>;
176
+ const D d;
177
+ const size_t N = hn::Lanes (d);
178
+ bool raise_overflow = false ;
179
+ bool raise_divbyzero = false ;
180
+ const auto vec_zero = hn::Zero (d);
181
+ const auto vec_min_val = hn::Set (d, std::numeric_limits<T>::min ());
182
+ const auto vec_neg_one = hn::Set (d, static_cast <T>(-1 ));
183
+
184
+ size_t i = 0 ;
185
+ for (; i + N <= static_cast <size_t >(len); i += N) {
186
+ const auto vec_a = hn::LoadU (d, src1 + i);
187
+ const auto vec_b = hn::LoadU (d, src2 + i);
188
+ const auto b_is_zero = hn::Eq (vec_b, vec_zero);
189
+ const auto a_is_min = hn::Eq (vec_a, vec_min_val);
190
+ const auto b_is_neg_one = hn::Eq (vec_b, vec_neg_one);
191
+ const auto overflow_cond = hn::And (a_is_min, b_is_neg_one);
192
+ auto vec_div = hn::Div (vec_a, vec_b);
193
+ const auto vec_mul = hn::Mul (vec_div, vec_b);
194
+ const auto has_remainder = hn::Ne (vec_a, vec_mul);
195
+ const auto a_sign = hn::Lt (vec_a, vec_zero);
196
+ const auto b_sign = hn::Lt (vec_b, vec_zero);
197
+ const auto different_signs = hn::Xor (a_sign, b_sign);
198
+ auto adjustment = hn::And (different_signs, has_remainder);
199
+ vec_div = hn::IfThenElse (adjustment,
200
+ hn::Sub (vec_div, hn::Set (d, static_cast <T>(1 ))),
201
+ vec_div);
202
+ vec_div = hn::IfThenElse (b_is_zero, vec_zero, vec_div);
203
+ vec_div = hn::IfThenElse (overflow_cond, vec_min_val, vec_div);
204
+ hn::StoreU (vec_div, d, dst + i);
205
+ if (!raise_divbyzero && !hn::AllFalse (d, b_is_zero)) {
206
+ raise_divbyzero = true ;
207
+ }
208
+ if (!raise_overflow && !hn::AllFalse (d, overflow_cond)) {
209
+ raise_overflow = true ;
210
+ }
211
+ }
212
+ for (; i < static_cast <size_t >(len); i++) {
213
+ T a = src1[i];
214
+ T b = src2[i];
215
+
216
+ if (b == 0 ) {
217
+ dst[i] = 0 ;
218
+ raise_divbyzero = true ;
219
+ }
220
+ else if (a == std::numeric_limits<T>::min () && b == -1 ) {
221
+ dst[i] = std::numeric_limits<T>::min ();
222
+ raise_overflow = true ;
223
+ }
224
+ else {
225
+ dst[i] = floor_div (a, b);
226
+ }
227
+ }
228
+
229
+ set_float_status (raise_overflow, raise_divbyzero);
230
+ }
231
+ // Unsigned division for arrays
232
+ template <typename T>
233
+ void simd_divide_contig_unsigned (T* src1, T* src2, T* dst, npy_intp len) {
234
+ using D = hn::ScalableTag<T>;
235
+ const D d;
236
+ const size_t N = hn::Lanes (d);
237
+
238
+ bool raise_divbyzero = false ;
239
+ const auto vec_zero = hn::Zero (d);
240
+
241
+ size_t i = 0 ;
242
+ for (; i + N <= static_cast <size_t >(len); i += N) {
243
+ const auto vec_a = hn::LoadU (d, src1 + i);
244
+ const auto vec_b = hn::LoadU (d, src2 + i);
245
+ const auto b_is_zero = hn::Eq (vec_b, vec_zero);
246
+ auto vec_div = hn::Div (vec_a, vec_b);
247
+ vec_div = hn::IfThenElse (b_is_zero, vec_zero, vec_div);
248
+ hn::StoreU (vec_div, d, dst + i);
249
+ if (!raise_divbyzero && !hn::AllFalse (d, b_is_zero)) {
250
+ raise_divbyzero = true ;
251
+ }
252
+ }
253
+ for (; i < static_cast <size_t >(len); i++) {
254
+ T a = src1[i];
255
+ T b = src2[i];
256
+
257
+ if (b == 0 ) {
258
+ dst[i] = 0 ;
259
+ raise_divbyzero = true ;
260
+ } else {
261
+ dst[i] = a / b;
262
+ }
263
+ }
264
+
265
+ set_float_status (false , raise_divbyzero);
266
+ }
267
+
172
268
173
269
// Dispatch functions for signed integer division
174
270
template <typename T>
175
271
void TYPE_divide (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func)) {
272
+ npy_clear_floatstatus ();
176
273
if (IS_BINARY_REDUCE) {
177
274
BINARY_REDUCE_LOOP (T) {
178
275
const T divisor = *reinterpret_cast <T*>(ip2);
@@ -189,8 +286,22 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
189
286
*reinterpret_cast <T*>(iop1) = io1;
190
287
return ;
191
288
}
192
- #if NPY_SIMD
193
- if (IS_BLOCKABLE_BINARY_SCALAR2 (sizeof (T), NPY_SIMD_WIDTH) &&
289
+ #if NPY_SIMD
290
+ // Handle array-array case
291
+ if (IS_BLOCKABLE_BINARY (sizeof (T), NPY_SIMD_WIDTH))
292
+ {
293
+ bool no_overlap = nomemoverlap (args[2 ], steps[2 ], args[0 ], steps[0 ], dimensions[0 ]) &&
294
+ nomemoverlap (args[2 ], steps[2 ], args[1 ], steps[1 ], dimensions[0 ]);
295
+ // Check if we can use SIMD for contiguous arrays - all steps must be equal to sizeof(T)
296
+ if (steps[0 ] == sizeof (T) && steps[1 ] == sizeof (T) && steps[2 ] == sizeof (T) && no_overlap) {
297
+ T* src1 = (T*)args[0 ];
298
+ T* src2 = (T*)args[1 ];
299
+ T* dst = (T*)args[2 ];
300
+ simd_divide_contig_signed (src1, src2, dst, dimensions[0 ]);
301
+ return ;
302
+ }
303
+ }
304
+ else if (IS_BLOCKABLE_BINARY_SCALAR2 (sizeof (T), NPY_SIMD_WIDTH) &&
194
305
*reinterpret_cast <T*>(args[1 ]) != 0 )
195
306
{
196
307
bool no_overlap = nomemoverlap (args[2 ], steps[2 ], args[0 ], steps[0 ], dimensions[0 ]);
@@ -225,6 +336,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
225
336
// Dispatch functions for unsigned integer division
226
337
template <typename T>
227
338
void TYPE_divide_unsigned (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func)) {
339
+ npy_clear_floatstatus ();
228
340
if (IS_BINARY_REDUCE) {
229
341
BINARY_REDUCE_LOOP (T) {
230
342
const T d = *reinterpret_cast <T*>(ip2);
@@ -239,7 +351,20 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
239
351
return ;
240
352
}
241
353
#if NPY_SIMD
242
- if (IS_BLOCKABLE_BINARY_SCALAR2 (sizeof (T), NPY_SIMD_WIDTH) &&
354
+ // Handle array-array case
355
+ if (IS_BLOCKABLE_BINARY (sizeof (T), NPY_SIMD_WIDTH)) {
356
+ bool no_overlap = nomemoverlap (args[2 ], steps[2 ], args[0 ], steps[0 ], dimensions[0 ]) &&
357
+ nomemoverlap (args[2 ], steps[2 ], args[1 ], steps[1 ], dimensions[0 ]);
358
+ // Check if we can use SIMD for contiguous arrays - all steps must be equal to sizeof(T)
359
+ if (steps[0 ] == sizeof (T) && steps[1 ] == sizeof (T) && steps[2 ] == sizeof (T) && no_overlap) {
360
+ T* src1 = (T*)args[0 ];
361
+ T* src2 = (T*)args[1 ];
362
+ T* dst = (T*)args[2 ];
363
+ simd_divide_contig_unsigned (src1, src2, dst, dimensions[0 ]);
364
+ return ;
365
+ }
366
+ }
367
+ else if (IS_BLOCKABLE_BINARY_SCALAR2 (sizeof (T), NPY_SIMD_WIDTH) &&
243
368
*reinterpret_cast <T*>(args[1 ]) != 0 )
244
369
{
245
370
bool no_overlap = nomemoverlap (args[2 ], steps[2 ], args[0 ], steps[0 ], dimensions[0 ]);
0 commit comments