Skip to content

Commit 32f4afa

Browse files
authored
ENH: improve Timsort with powersort merge-policy (#29208)
Implement the powersort merge policy for Timsort, developed by Munro and Wild, in place of the original stack-collapse rule. Benchmarks show a significant performance improvement.
1 parent f6a17f0 commit 32f4afa

File tree

1 file changed

+51
-59
lines changed

1 file changed

+51
-59
lines changed

numpy/_core/src/npysort/timsort.cpp

Lines changed: 51 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@
3939
#include <cstdlib>
4040
#include <utility>
4141

42-
/* enough for 32 * 1.618 ** 128 elements */
43-
#define TIMSORT_STACK_SIZE 128
42+
/* enough for 32 * 1.618 ** 128 elements.
43+
If powersort were used in all cases, 90 would suffice, since 32 * 2 ** 90 >= 32 * 1.618 ** 128 */
44+
#define RUN_STACK_SIZE 128
4445

4546
static npy_intp
4647
compute_min_run(npy_intp num)
@@ -58,6 +59,7 @@ compute_min_run(npy_intp num)
5859
typedef struct {
5960
npy_intp s; /* start index of the run within the array */
6061
npy_intp l; /* length */
62+
int power; /* node "level" for powersort merge strategy */
6163
} run;
6264

6365
/* buffer for argsort. Declared here to avoid multiple declarations. */
@@ -383,60 +385,51 @@ merge_at_(type *arr, const run *stack, const npy_intp at, buffer_<Tag> *buffer)
383385
return 0;
384386
}
385387

386-
template <typename Tag, typename type>
388+
/* See https://github.com/python/cpython/blob/ea23c897cd25702e72a04e06664f6864f07a7c5d/Objects/listsort.txt
389+
* for a detailed explanation.
390+
* CPython calls the total-length parameter *n*; it is named *num* here for consistency with the NumPy implementation.
391+
*/
387392
static int
388-
try_collapse_(type *arr, run *stack, npy_intp *stack_ptr, buffer_<Tag> *buffer)
393+
powerloop(npy_intp s1, npy_intp n1, npy_intp n2, npy_intp num)
389394
{
390-
int ret;
391-
npy_intp A, B, C, top;
392-
top = *stack_ptr;
393-
394-
while (1 < top) {
395-
B = stack[top - 2].l;
396-
C = stack[top - 1].l;
397-
398-
if ((2 < top && stack[top - 3].l <= B + C) ||
399-
(3 < top && stack[top - 4].l <= stack[top - 3].l + B)) {
400-
A = stack[top - 3].l;
401-
402-
if (A <= C) {
403-
ret = merge_at_<Tag>(arr, stack, top - 3, buffer);
404-
405-
if (NPY_UNLIKELY(ret < 0)) {
406-
return ret;
407-
}
408-
409-
stack[top - 3].l += B;
410-
stack[top - 2] = stack[top - 1];
411-
--top;
412-
}
413-
else {
414-
ret = merge_at_<Tag>(arr, stack, top - 2, buffer);
415-
416-
if (NPY_UNLIKELY(ret < 0)) {
417-
return ret;
418-
}
419-
420-
stack[top - 2].l += C;
421-
--top;
422-
}
395+
int result = 0;
396+
npy_intp a = 2 * s1 + n1; /* 2*a */
397+
npy_intp b = a + n1 + n2; /* 2*b */
398+
for (;;) {
399+
++result;
400+
if (a >= num) { /* both quotient bits are 1 */
401+
a -= num;
402+
b -= num;
423403
}
424-
else if (1 < top && B <= C) {
425-
ret = merge_at_<Tag>(arr, stack, top - 2, buffer);
404+
else if (b >= num) { /* a/num bit is 0, b/num bit is 1 */
405+
break;
406+
}
407+
a <<= 1;
408+
b <<= 1;
409+
}
410+
return result;
411+
}
426412

413+
template <typename Tag, typename type>
414+
static int
415+
found_new_run_(type *arr, run *stack, npy_intp *stack_ptr, npy_intp n2,
416+
npy_intp num, buffer_<Tag> *buffer)
417+
{
418+
int ret;
419+
if (*stack_ptr > 0) {
420+
npy_intp s1 = stack[*stack_ptr - 1].s;
421+
npy_intp n1 = stack[*stack_ptr - 1].l;
422+
int power = powerloop(s1, n1, n2, num);
423+
while (*stack_ptr > 1 && stack[*stack_ptr - 2].power > power) {
424+
ret = merge_at_<Tag>(arr, stack, *stack_ptr - 2, buffer);
427425
if (NPY_UNLIKELY(ret < 0)) {
428426
return ret;
429427
}
430-
431-
stack[top - 2].l += C;
432-
--top;
433-
}
434-
else {
435-
break;
428+
stack[*stack_ptr - 2].l += stack[*stack_ptr - 1].l;
429+
--(*stack_ptr);
436430
}
431+
stack[*stack_ptr - 1].power = power;
437432
}
438-
439-
*stack_ptr = top;
440433
return 0;
441434
}
442435

@@ -491,23 +484,22 @@ timsort_(void *start, npy_intp num)
491484
int ret;
492485
npy_intp l, n, stack_ptr, minrun;
493486
buffer_<Tag> buffer;
494-
run stack[TIMSORT_STACK_SIZE];
487+
run stack[RUN_STACK_SIZE];
495488
buffer.pw = NULL;
496489
buffer.size = 0;
497490
stack_ptr = 0;
498491
minrun = compute_min_run(num);
499492

500493
for (l = 0; l < num;) {
501494
n = count_run_<Tag>((type *)start, l, num, minrun);
495+
ret = found_new_run_<Tag>((type *)start, stack, &stack_ptr, n, num, &buffer);
496+
if (NPY_UNLIKELY(ret < 0))
497+
goto cleanup;
498+
499+
// Push the new run onto the stack.
502500
stack[stack_ptr].s = l;
503501
stack[stack_ptr].l = n;
504502
++stack_ptr;
505-
ret = try_collapse_<Tag>((type *)start, stack, &stack_ptr, &buffer);
506-
507-
if (NPY_UNLIKELY(ret < 0)) {
508-
goto cleanup;
509-
}
510-
511503
l += n;
512504
}
513505

@@ -897,7 +889,7 @@ atimsort_(void *v, npy_intp *tosort, npy_intp num)
897889
int ret;
898890
npy_intp l, n, stack_ptr, minrun;
899891
buffer_intp buffer;
900-
run stack[TIMSORT_STACK_SIZE];
892+
run stack[RUN_STACK_SIZE];
901893
buffer.pw = NULL;
902894
buffer.size = 0;
903895
stack_ptr = 0;
@@ -1371,7 +1363,7 @@ string_timsort_(void *start, npy_intp num, void *varr)
13711363
size_t len = elsize / sizeof(type);
13721364
int ret;
13731365
npy_intp l, n, stack_ptr, minrun;
1374-
run stack[TIMSORT_STACK_SIZE];
1366+
run stack[RUN_STACK_SIZE];
13751367
string_buffer_<Tag> buffer;
13761368

13771369
/* Items that have zero size don't make sense to sort */
@@ -1800,7 +1792,7 @@ string_atimsort_(void *start, npy_intp *tosort, npy_intp num, void *varr)
18001792
size_t len = elsize / sizeof(type);
18011793
int ret;
18021794
npy_intp l, n, stack_ptr, minrun;
1803-
run stack[TIMSORT_STACK_SIZE];
1795+
run stack[RUN_STACK_SIZE];
18041796
buffer_intp buffer;
18051797

18061798
/* Items that have zero size don't make sense to sort */
@@ -2253,7 +2245,7 @@ npy_timsort(void *start, npy_intp num, void *varr)
22532245
PyArray_CompareFunc *cmp = PyDataType_GetArrFuncs(PyArray_DESCR(arr))->compare;
22542246
int ret;
22552247
npy_intp l, n, stack_ptr, minrun;
2256-
run stack[TIMSORT_STACK_SIZE];
2248+
run stack[RUN_STACK_SIZE];
22572249
buffer_char buffer;
22582250

22592251
/* Items that have zero size don't make sense to sort */
@@ -2689,7 +2681,7 @@ npy_atimsort(void *start, npy_intp *tosort, npy_intp num, void *varr)
26892681
PyArray_CompareFunc *cmp = PyDataType_GetArrFuncs(PyArray_DESCR(arr))->compare;
26902682
int ret;
26912683
npy_intp l, n, stack_ptr, minrun;
2692-
run stack[TIMSORT_STACK_SIZE];
2684+
run stack[RUN_STACK_SIZE];
26932685
buffer_intp buffer;
26942686

26952687
/* Items that have zero size don't make sense to sort */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy