Your code is fast - if you’re lucky

sort.h - a quicksort with sorting networks

// SPDX-License-Identifier: MIT
// sort.h - Branchless Quicksort
// (c) christof.kaser@gmail.com

#ifndef SORT_H
#define SORT_H

#ifndef BLQS_CMP
#define BLQS_CMP(a, b) ((a) < (b))
#endif

#include <stddef.h>
#include <string.h>
#define min(a, b) (((a) < (b)) ? (a) : (b))

#define SMALLPART 1024
#define SWSZ 512
#define UNROLL 16

#define sort2(a, b) do {  \
    unsigned m = BLQS_CMP(a, b); \
    BLQS_TYPE x = a; \
    a = m ? a : b;  \
    b = m ? b : x;  \
} while(0)

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

#define sort6(a, b, c, d, e, f) do { \
    sort2(a, b); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, d); sort2(e, f); \
    sort2(a, e); sort2(b, f); sort2(c, e); \
    sort2(d, f); sort2(b, c); sort2(d, e); \
    sort2(c, d); \
} while(0)

#define sort7(a, b, c, d, e, f, g) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(e, f); \
    sort2(e, g); sort2(f, g); sort2(a, e); \
    sort2(b, f); sort2(c, g); sort2(b, e); \
    sort2(d, g); sort2(c, e); sort2(b, c); \
    sort2(d, f); sort2(d, e); \
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(b,c); sort2(f,g); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(c,e); sort2(d,f); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
    sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
    sort2(b,e); sort2(d,g); sort2(f,h); \
    sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
    sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
    sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
    sort2(a,b); sort2(d,g); sort2(i,j); \
    sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
    sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(d,e); sort2(f,g); \
} while (0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
    sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
    sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
    sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
    sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
    sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
    sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

static void sorting_network(BLQS_TYPE* l, int partszm1_min_1) {
    switch (partszm1_min_1) {
    case 0: break;
    case 1: sort2(l[0],l[1]); break;
    case 2: sort3(l[0],l[1],l[2]); break;
    case 3: sort4(l[0],l[1],l[2],l[3]); break;
    case 4: sort5(l[0],l[1],l[2],l[3],l[4]); break;
    case 5: sort6(l[0],l[1],l[2],l[3],l[4],l[5]); break;
    case 6: sort7(l[0],l[1],l[2],l[3],l[4],l[5],l[6]); break;
    case 7: sort8(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7]); break;
    case 8: sort9(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8]); break;
    case 9: sort10(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9]); break;
    case 10: sort11(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10]); break;
    case 11: sort12(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10],l[11]); break;
    }
}
#define med5(a,b,c,d,e) do { \
    sort2(a,b); sort2(c,d); sort2(a,c); \
    sort2(b,d); sort2(b,c); sort2(c,e); \
    sort2(b,c); \
} while(0)


static BLQS_TYPE* partition_small(BLQS_TYPE* left, BLQS_TYPE* right) {

    BLQS_TYPE* outerleft = left;
    BLQS_TYPE* pivp = left + 6;

    BLQS_TYPE piv = *pivp;

    BLQS_TYPE l1 = left[1],l2 = left[2];
    BLQS_TYPE r1 = right[-1], r0 = *right;
    med5(l1, l2, piv, r1, r0);
    left[1] = l1; left[2] = l2;
    right[-1] = r1; *right = r0;
    left += 3;
    right -= 2;

    *pivp = *outerleft;

    BLQS_TYPE swbuf[SMALLPART];
    BLQS_TYPE* sw = swbuf;
    BLQS_TYPE* lwr = left;

    while (right - left >= UNROLL) for (int i = UNROLL; i--;) {
        BLQS_TYPE x = *left++;
        if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
        else { *sw = x; sw++; }
    }
    while (left <= right) {
        BLQS_TYPE x = *left++;
        if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
        else { *sw = x; sw++; }
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

static BLQS_TYPE* partition(BLQS_TYPE* left, BLQS_TYPE* right) {
    BLQS_TYPE* outerleft = left;
    BLQS_TYPE* pivp = left + (right - left) / 2;

    BLQS_TYPE piv = *pivp;

    med5(left[1],left[2],left[3],left[4],left[5]);
    med5(left[11],left[12],left[13],left[14],left[15]);
    med5(pivp[-2], pivp[-1], piv, pivp[1], pivp[2]);
    med5(right[-14], right[-13], right[-12], right[-11], right[-10]);
    med5(right[-4], right[-3], right[-2], right[-1], right[0]);
    med5(left[3], left[13], piv, right[-12], right[-2]);

    left += 1;
    *pivp = *outerleft;

    BLQS_TYPE swbuf[SWSZ];
    BLQS_TYPE *rwr = right, *sw = swbuf;
    BLQS_TYPE *lwr = left;

    while (UNROLL < SWSZ - (sw - swbuf) && left < right - UNROLL) {
        ptrdiff_t avail = min(right - left, SWSZ - (sw - swbuf));
        BLQS_TYPE* endp = right - avail;
        while (right > endp + UNROLL) {
            for (int i = UNROLL; i--;) {
                BLQS_TYPE x = *right--;
                if (BLQS_CMP(x, piv)) { *sw = x; sw++; }
                else { *rwr = x; rwr--; }
            }
        }
    }

    while (right - left >= UNROLL &&
            (rwr - right > UNROLL || left - lwr > UNROLL)) {

        while (rwr - right > UNROLL && right - left >= UNROLL) {
            for (int i = UNROLL; i--;) {
                BLQS_TYPE x = *left++;
                if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
                else { *rwr = x; rwr--; }
            }
        }
        while (left - lwr > UNROLL && right - left >= UNROLL) {
            for (int i = UNROLL; i--;) {
                BLQS_TYPE x = *right--;
                if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
                else { *rwr = x; rwr--; }
            }
        }
    }
    do {
        while (rwr > right && left <= right) {
            BLQS_TYPE x = *left++;
            if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
            else { *rwr = x; rwr--; }
        }
        while (lwr < left && left <= right) {
            BLQS_TYPE x = *right--;
            if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
            else { *rwr = x; rwr--; }
        }
    } while ((lwr < left||rwr > right) && left <= right);

    while (left <= right && !BLQS_CMP(*right, piv)) {
        right--;
        rwr--;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

static void smallsort(BLQS_TYPE* left, BLQS_TYPE* right) {
    while (right - left > 11) {
        BLQS_TYPE* mid = partition_small(left, right);
        smallsort(left, mid - 1);
        left = mid + 1;
    }
    sorting_network(left, right - left);
}

static void sortr(BLQS_TYPE* left, BLQS_TYPE* right) {
    while (1) {
        ptrdiff_t partszm1 = right - left;
        if (partszm1 <= SMALLPART) break;
        BLQS_TYPE* mid = partition(left, right);

        if (mid - left < partszm1 / 16) {
            if (mid > left) sortr(left, mid - 1);
            BLQS_TYPE piv = *mid;
            mid += 1;
            // collect duplicates
            for (BLQS_TYPE* p = mid; p <= right; p++) {
                if (!BLQS_CMP(piv, *p)) {
                    BLQS_TYPE h = *mid;
                    *mid = *p;
                    *p = h;
                    mid++;
                }
            }
            left = mid;
            if (right - left < SMALLPART) break;
            mid = partition(left, right);
        }
        if (mid - left < right - mid) {
            sortr(left, mid - 1);
            left = mid + 1;
        }
        else {
            sortr(mid + 1, right);
            right = mid - 1;
        }
    }
    smallsort(left, right);
}


static void sort(BLQS_TYPE* data, int len) {
    if (len < 2) return;
    for (BLQS_TYPE* p = data + 1; p < data + len; p++) {
        if (BLQS_CMP(*p, *(p - 1))) goto not_sorted;
    }
    return;
not_sorted:
    sortr(data, data + len - 1);
}

#endif
test.c - sorting 50 million doubles

// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

#define BLQS_CMP(a, b) ((a) < (b))
#define BLQS_TYPE double
#include "sort.h"

#define SIZE 50000000
double data[SIZE];

double ts(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}

int main() {
    double t0;
    for (int i = 0; i < SIZE; i++) data[i] = rand() / 1024.0;
    t0 = ts();
    sort(data, SIZE);
    printf("Time: %.2fs\n", ts() - t0);
}
On macOS/M1 (Clang, -O3):

Time: 4.39
C++ std::sort needs 1.33 seconds for this.

A few cosmetic changes

It is already micro-optimized using sorting networks and loop unrolling. Only a few cosmetic changes remain.

We rewrite this beginner‑friendly style, which explicitly shows exactly how the pointers are moved:

if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *rwr = x; rwr--; }
into a more idiomatic and compact C form:

if (BLQS_CMP(x, piv)) *lwr++ = x;
else *rwr-- = x;
sort.h - rewritten

// SPDX-License-Identifier: MIT
// blqsort.h - Branchless Quicksort
// (c) christof.kaser@gmail.com

#ifndef SORT_H
#define SORT_H

#ifndef BLQS_CMP
#define BLQS_CMP(a, b) ((a) < (b))
#endif

#include <stddef.h>
#include <string.h>
#define min(a, b) (((a) < (b)) ? (a) : (b))

#define SMALLPART 2048
#define SWSZ 512
#define UNROLL 16

#define sort2(a, b) do {  \
    unsigned m = BLQS_CMP(a, b); \
    BLQS_TYPE x = a; \
    a = m ? a : b;  \
    b = m ? b : x;  \
} while(0)

#define sort3(a, b, c) do { \
    sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)

#define sort4(a, b, c, d) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); \
} while(0)

#define sort5(a, b, c, d, e) do { \
    sort2(b, c); sort2(d, e); sort2(b, d); \
    sort2(a, c); sort2(a, d); sort2(c, e); \
    sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)

#define sort6(a, b, c, d, e, f) do { \
    sort2(a, b); sort2(c, d); sort2(e, f); \
    sort2(a, c); sort2(b, d); sort2(e, f); \
    sort2(a, e); sort2(b, f); sort2(c, e); \
    sort2(d, f); sort2(b, c); sort2(d, e); \
    sort2(c, d); \
} while(0)

#define sort7(a, b, c, d, e, f, g) do { \
    sort2(a, b); sort2(c, d); sort2(a, c); \
    sort2(b, d); sort2(b, c); sort2(e, f); \
    sort2(e, g); sort2(f, g); sort2(a, e); \
    sort2(b, f); sort2(c, g); sort2(b, e); \
    sort2(d, g); sort2(c, e); sort2(b, c); \
    sort2(d, f); sort2(d, e); \
} while(0)

#define sort8(a,b,c,d,e,f,g,h) do { \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
    sort2(b,c); sort2(f,g); \
    sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
    sort2(c,e); sort2(d,f); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort9(a,b,c,d,e,f,g,h,i) do { \
    sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
    sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
    sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
    sort2(b,e); sort2(d,g); sort2(f,h); \
    sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)

#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
    sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
    sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
    sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
    sort2(a,b); sort2(d,g); sort2(i,j); \
    sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
    sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
    sort2(d,e); sort2(f,g); \
} while (0)

#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
    sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
    sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
    sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
    sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
    sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
    sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)

#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
    sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
    sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
    sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
    sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
    sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
    sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
    sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
    sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)

static void sorting_network(BLQS_TYPE* l, int partszm1_min_1) {
    switch (partszm1_min_1) {
    case 0: break;
    case 1: sort2(l[0],l[1]); break;
    case 2: sort3(l[0],l[1],l[2]); break;
    case 3: sort4(l[0],l[1],l[2],l[3]); break;
    case 4: sort5(l[0],l[1],l[2],l[3],l[4]); break;
    case 5: sort6(l[0],l[1],l[2],l[3],l[4],l[5]); break;
    case 6: sort7(l[0],l[1],l[2],l[3],l[4],l[5],l[6]); break;
    case 7: sort8(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7]); break;
    case 8: sort9(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8]); break;
    case 9: sort10(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9]); break;
    case 10: sort11(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10]); break;
    case 11: sort12(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10],l[11]); break;
    }
}
#define med5(a,b,c,d,e) do { \
    sort2(a,b); sort2(c,d); sort2(a,c); \
    sort2(b,d); sort2(b,c); sort2(c,e); \
    sort2(b,c); \
} while(0)

static BLQS_TYPE* partition_small(BLQS_TYPE* left, BLQS_TYPE* right) {

    BLQS_TYPE* outerleft = left;
    BLQS_TYPE* pivp = left + 6;

    BLQS_TYPE piv = *pivp;

    BLQS_TYPE l1 = left[1],l2 = left[2];
    BLQS_TYPE r1 = right[-1], r0 = *right;
    med5(l1, l2, piv, r1, r0);
    left[1] = l1; left[2] = l2;
    right[-1] = r1; *right = r0;
    left += 3;
    right -= 2;

    *pivp = *outerleft;

    BLQS_TYPE swbuf[SMALLPART];
    BLQS_TYPE* sw = swbuf;
    BLQS_TYPE* lwr = left;

    while (right - left >= UNROLL) for (int i = UNROLL; i--;) {
        BLQS_TYPE x = *left++;
        if (BLQS_CMP(x, piv)) *lwr++ = x;
        else *sw++ = x;
    }
    while (left <= right) {
        BLQS_TYPE x = *left++;
        if (BLQS_CMP(x, piv)) *lwr++ = x;
        else *sw++ = x;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
    lwr -= 1;
    *outerleft = *lwr;
    *lwr = piv;
    return lwr;
}

static BLQS_TYPE* partition(BLQS_TYPE* left, BLQS_TYPE* right) {
    BLQS_TYPE* outerleft = left;
    BLQS_TYPE* pivp = left + (right - left) / 2;

    BLQS_TYPE piv = *pivp;

    med5(left[1],left[2],left[3],left[4],left[5]);
    med5(left[11],left[12],left[13],left[14],left[15]);
    med5(pivp[-2], pivp[-1], piv, pivp[1], pivp[2]);
    med5(right[-14], right[-13], right[-12], right[-11], right[-10]);
    med5(right[-4], right[-3], right[-2], right[-1], right[0]);
    med5(left[3], left[13], piv, right[-12], right[-2]);

    left += 1;
    *pivp = *outerleft;

    BLQS_TYPE swbuf[SWSZ];
    BLQS_TYPE *rwr = right, *sw = swbuf;
    BLQS_TYPE *lwr = left;

    while (UNROLL < SWSZ - (sw - swbuf) && left < right - UNROLL) {
        ptrdiff_t avail = min(right - left, SWSZ - (sw - swbuf));
        BLQS_TYPE* endp = right - avail;
        while (right > endp + UNROLL) {
            for (int i = UNROLL; i--;) {
                BLQS_TYPE x = *right--;
                if (BLQS_CMP(x, piv)) *sw++ = x;
                else *rwr-- = x;

            }
        }
    }

    while (right - left >= UNROLL &&
            (rwr - right > UNROLL || left - lwr > UNROLL)) {

        while (rwr - right > UNROLL && right - left >= UNROLL) {
            for (int i = UNROLL; i--;) {
                BLQS_TYPE x = *left++;
                if (BLQS_CMP(x, piv)) *lwr++ = x;
                else *rwr-- = x;
            }
        }
        while (left - lwr > UNROLL && right - left >= UNROLL) {
            for (int i = UNROLL; i--;) {
                BLQS_TYPE x = *right--;
                if (BLQS_CMP(x, piv)) *lwr++ = x;
                else *rwr-- = x;
            }
        }
    }

    do {
        while (rwr > right && left <= right) {
            BLQS_TYPE x = *left++;
            if (BLQS_CMP(x, piv)) *lwr++ = x;
            else *rwr-- = x;
        }
        while (lwr < left && left <= right) {
            BLQS_TYPE x = *right--;
            if (BLQS_CMP(x, piv)) *lwr++ = x;
            else *rwr-- = x;
        }
    } while ((lwr < left||rwr > right) && left <= right);

    while (left <= right && !BLQS_CMP(*right, piv)) {
        right--;
        rwr--;
    }
    memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
    *outerleft = *rwr;
    *rwr = piv;
    return rwr;
}

static void smallsort(BLQS_TYPE* left, BLQS_TYPE* right) {
    while (right - left > 11) {
        BLQS_TYPE* mid = partition_small(left, right);
        smallsort(left, mid - 1);
        left = mid + 1;
    }
    sorting_network(left, right - left);
}

static void sortr(BLQS_TYPE* left, BLQS_TYPE* right) {
    while (1) {
        ptrdiff_t partszm1 = right - left;
        if (partszm1 <= SMALLPART) break;
        BLQS_TYPE* mid = partition(left, right);

        if (mid - left < partszm1 / 16) {
            if (mid > left) sortr(left, mid - 1);
            BLQS_TYPE piv = *mid;
            mid += 1;
            // collect duplicates
            for (BLQS_TYPE* p = mid; p <= right; p++) {
                if (!BLQS_CMP(piv, *p)) {
                    BLQS_TYPE h = *mid;
                    *mid = *p;
                    *p = h;
                    mid++;
                }
            }
            left = mid;
            if (right - left < SMALLPART) break;
            mid = partition(left, right);
        }
        if (mid - left < right - mid) {
            sortr(left, mid - 1);
            left = mid + 1;
        }
        else {
            sortr(mid + 1, right);
            right = mid - 1;
        }
    }
    smallsort(left, right);
}

static void sort(BLQS_TYPE* data, int len) {
    if (len < 2) return;
    for (BLQS_TYPE* p = data + 1; p < data + len; p++) {
        if (BLQS_CMP(*p, *(p - 1))) goto not_sorted;
    }
    return;
not_sorted:
    sortr(data, data + len - 1);
}

#endif
A quick test to make sure everything still works.

Time: 0.70
More than 6 times faster than before, and nearly twice as fast as std::sort. That’s quite something.

But what actually happened?

This “small cosmetic” change causes Clang to replace branches with csel.

With branches

; x20 = left, x9 = right, d8 = pivot

loop:
    ldr  d0, [x12], #8
    fcmp d0, d8
    b.pl ge_case
    str  d0, [x20], #8      ; left++
    b    next
ge_case:
    str  d0, [x9], #-8      ; right--
next:
    cmp  x12, x_end
    b.lt loop
Fast with csel (branchless)

; x20 = left, x11 = right, d8 = pivot, x10 = 8

loop:
    ldr  d0, [x12], #8        ; load val
    fcmp d0, d8               ; compare
    csel x13, x20, x11, mi    ; if < pivot → left else right
    csel x14, x10, xzr, mi    ; left += 8 if <
    csel x15, x10, xzr, pl    ; right -= 8 if >=   (!! fix)
    str  d0, [x13]            ; store
    add  x20, x20, x14        ; update left
    sub  x11, x11, x15        ; update right
    cmp  x12, x_end
    b.lt loop
On x86, Clang behaves similarly: with the compact if, it generates branchless code using cmov (conditional move).

GCC does not exhibit this “quirk” (different code generation for logically equivalent source). It consistently generates the slower branch-based version.

Links

blqsort - Fast Quicksort with C and C++ Interface

When ‘if’ slows you down, avoid it

Interactive sorting demo


christof.kaser@gmail.com