sort.h - a quicksort with sorting networks
// SPDX-License-Identifier: MIT
// sort.h - Branchless Quicksort
// (c) christof.kaser@gmail.com
#ifndef SORT_H
#define SORT_H
#ifndef BLQS_CMP
#define BLQS_CMP(a, b) ((a) < (b))
#endif
#include <stddef.h>
#include <string.h>
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define SMALLPART 1024
#define SWSZ 512
#define UNROLL 16
#define sort2(a, b) do { \
unsigned m = BLQS_CMP(a, b); \
BLQS_TYPE x = a; \
a = m ? a : b; \
b = m ? b : x; \
} while(0)
#define sort3(a, b, c) do { \
sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)
#define sort4(a, b, c, d) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); \
} while(0)
#define sort5(a, b, c, d, e) do { \
sort2(b, c); sort2(d, e); sort2(b, d); \
sort2(a, c); sort2(a, d); sort2(c, e); \
sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)
#define sort6(a, b, c, d, e, f) do { \
sort2(a, b); sort2(c, d); sort2(e, f); \
sort2(a, c); sort2(b, d); sort2(e, f); \
sort2(a, e); sort2(b, f); sort2(c, e); \
sort2(d, f); sort2(b, c); sort2(d, e); \
sort2(c, d); \
} while(0)
#define sort7(a, b, c, d, e, f, g) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(e, f); \
sort2(e, g); sort2(f, g); sort2(a, e); \
sort2(b, f); sort2(c, g); sort2(b, e); \
sort2(d, g); sort2(c, e); sort2(b, c); \
sort2(d, f); sort2(d, e); \
} while(0)
#define sort8(a,b,c,d,e,f,g,h) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(b,c); sort2(f,g); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(c,e); sort2(d,f); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort9(a,b,c,d,e,f,g,h,i) do { \
sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
sort2(b,e); sort2(d,g); sort2(f,h); \
sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
sort2(a,b); sort2(d,g); sort2(i,j); \
sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(d,e); sort2(f,g); \
} while (0)
#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)
#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
static void sorting_network(BLQS_TYPE* l, int partszm1_min_1) {
switch (partszm1_min_1) {
case 0: break;
case 1: sort2(l[0],l[1]); break;
case 2: sort3(l[0],l[1],l[2]); break;
case 3: sort4(l[0],l[1],l[2],l[3]); break;
case 4: sort5(l[0],l[1],l[2],l[3],l[4]); break;
case 5: sort6(l[0],l[1],l[2],l[3],l[4],l[5]); break;
case 6: sort7(l[0],l[1],l[2],l[3],l[4],l[5],l[6]); break;
case 7: sort8(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7]); break;
case 8: sort9(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8]); break;
case 9: sort10(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9]); break;
case 10: sort11(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10]); break;
case 11: sort12(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10],l[11]); break;
}
}
#define med5(a,b,c,d,e) do { \
sort2(a,b); sort2(c,d); sort2(a,c); \
sort2(b,d); sort2(b,c); sort2(c,e); \
sort2(b,c); \
} while(0)
static BLQS_TYPE* partition_small(BLQS_TYPE* left, BLQS_TYPE* right) {
BLQS_TYPE* outerleft = left;
BLQS_TYPE* pivp = left + 6;
BLQS_TYPE piv = *pivp;
BLQS_TYPE l1 = left[1],l2 = left[2];
BLQS_TYPE r1 = right[-1], r0 = *right;
med5(l1, l2, piv, r1, r0);
left[1] = l1; left[2] = l2;
right[-1] = r1; *right = r0;
left += 3;
right -= 2;
*pivp = *outerleft;
BLQS_TYPE swbuf[SMALLPART];
BLQS_TYPE* sw = swbuf;
BLQS_TYPE* lwr = left;
while (right - left >= UNROLL) for (int i = UNROLL; i--;) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *sw = x; sw++; }
}
while (left <= right) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *sw = x; sw++; }
}
memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
lwr -= 1;
*outerleft = *lwr;
*lwr = piv;
return lwr;
}
static BLQS_TYPE* partition(BLQS_TYPE* left, BLQS_TYPE* right) {
BLQS_TYPE* outerleft = left;
BLQS_TYPE* pivp = left + (right - left) / 2;
BLQS_TYPE piv = *pivp;
med5(left[1],left[2],left[3],left[4],left[5]);
med5(left[11],left[12],left[13],left[14],left[15]);
med5(pivp[-2], pivp[-1], piv, pivp[1], pivp[2]);
med5(right[-14], right[-13], right[-12], right[-11], right[-10]);
med5(right[-4], right[-3], right[-2], right[-1], right[0]);
med5(left[3], left[13], piv, right[-12], right[-2]);
left += 1;
*pivp = *outerleft;
BLQS_TYPE swbuf[SWSZ];
BLQS_TYPE *rwr = right, *sw = swbuf;
BLQS_TYPE *lwr = left;
while (UNROLL < SWSZ - (sw - swbuf) && left < right - UNROLL) {
ptrdiff_t avail = min(right - left, SWSZ - (sw - swbuf));
BLQS_TYPE* endp = right - avail;
while (right > endp + UNROLL) {
for (int i = UNROLL; i--;) {
BLQS_TYPE x = *right--;
if (BLQS_CMP(x, piv)) { *sw = x; sw++; }
else { *rwr = x; rwr--; }
}
}
}
while (right - left >= UNROLL &&
(rwr - right > UNROLL || left - lwr > UNROLL)) {
while (rwr - right > UNROLL && right - left >= UNROLL) {
for (int i = UNROLL; i--;) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *rwr = x; rwr--; }
}
}
while (left - lwr > UNROLL && right - left >= UNROLL) {
for (int i = UNROLL; i--;) {
BLQS_TYPE x = *right--;
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *rwr = x; rwr--; }
}
}
}
do {
while (rwr > right && left <= right) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *rwr = x; rwr--; }
}
while (lwr < left && left <= right) {
BLQS_TYPE x = *right--;
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *rwr = x; rwr--; }
}
} while ((lwr < left||rwr > right) && left <= right);
while (left <= right && !BLQS_CMP(*right, piv)) {
right--;
rwr--;
}
memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
*outerleft = *rwr;
*rwr = piv;
return rwr;
}
static void smallsort(BLQS_TYPE* left, BLQS_TYPE* right) {
while (right - left > 11) {
BLQS_TYPE* mid = partition_small(left, right);
smallsort(left, mid - 1);
left = mid + 1;
}
sorting_network(left, right - left);
}
static void sortr(BLQS_TYPE* left, BLQS_TYPE* right) {
while (1) {
ptrdiff_t partszm1 = right - left;
if (partszm1 <= SMALLPART) break;
BLQS_TYPE* mid = partition(left, right);
if (mid - left < partszm1 / 16) {
if (mid > left) sortr(left, mid - 1);
BLQS_TYPE piv = *mid;
mid += 1;
// collect duplicates
for (BLQS_TYPE* p = mid; p <= right; p++) {
if (!BLQS_CMP(piv, *p)) {
BLQS_TYPE h = *mid;
*mid = *p;
*p = h;
mid++;
}
}
left = mid;
if (right - left < SMALLPART) break;
mid = partition(left, right);
}
if (mid - left < right - mid) {
sortr(left, mid - 1);
left = mid + 1;
}
else {
sortr(mid + 1, right);
right = mid - 1;
}
}
smallsort(left, right);
}
static void sort(BLQS_TYPE* data, int len) {
if (len < 2) return;
for (BLQS_TYPE* p = data + 1; p < data + len; p++) {
if (BLQS_CMP(*p, *(p - 1))) goto not_sorted;
}
return;
not_sorted:
sortr(data, data + len - 1);
}
#endif
test.c - sorting 50 million doubles
// SPDX-License-Identifier: MIT
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define BLQS_CMP(a, b) ((a) < (b))
#define BLQS_TYPE double
#include "sort.h"
#define SIZE 50000000
double data[SIZE];
double ts(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec / 1000000.0;
}
int main() {
double t0;
for (int i = 0; i < SIZE; i++) data[i] = rand() / 1024.0;
t0 = ts();
sort(data, SIZE);
printf("Time: %.2fs\n", ts() - t0);
}
On macOS/M1 (Clang, -O3):
Time: 4.39
C++ std::sort needs 1.33 seconds for this.
It is already micro-optimized using sorting networks and loop unrolling. Only a few cosmetic changes remain.
We rewrite this beginner‑friendly style, which explicitly shows exactly how the pointers are moved:
if (BLQS_CMP(x, piv)) { *lwr = x; lwr++; }
else { *rwr = x; rwr--; }
into a more idiomatic and compact C form:
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *rwr-- = x;
sort.h - rewritten
// SPDX-License-Identifier: MIT
// blqsort.h - Branchless Quicksort
// (c) christof.kaser@gmail.com
#ifndef SORT_H
#define SORT_H
#ifndef BLQS_CMP
#define BLQS_CMP(a, b) ((a) < (b))
#endif
#include <stddef.h>
#include <string.h>
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define SMALLPART 2048
#define SWSZ 512
#define UNROLL 16
#define sort2(a, b) do { \
unsigned m = BLQS_CMP(a, b); \
BLQS_TYPE x = a; \
a = m ? a : b; \
b = m ? b : x; \
} while(0)
#define sort3(a, b, c) do { \
sort2(a, b); sort2(b, c); sort2(a, b); \
} while(0)
#define sort4(a, b, c, d) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); \
} while(0)
#define sort5(a, b, c, d, e) do { \
sort2(b, c); sort2(d, e); sort2(b, d); \
sort2(a, c); sort2(a, d); sort2(c, e); \
sort2(a, b); sort2(c, d); sort2(b, c); \
} while(0)
#define sort6(a, b, c, d, e, f) do { \
sort2(a, b); sort2(c, d); sort2(e, f); \
sort2(a, c); sort2(b, d); sort2(e, f); \
sort2(a, e); sort2(b, f); sort2(c, e); \
sort2(d, f); sort2(b, c); sort2(d, e); \
sort2(c, d); \
} while(0)
#define sort7(a, b, c, d, e, f, g) do { \
sort2(a, b); sort2(c, d); sort2(a, c); \
sort2(b, d); sort2(b, c); sort2(e, f); \
sort2(e, g); sort2(f, g); sort2(a, e); \
sort2(b, f); sort2(c, g); sort2(b, e); \
sort2(d, g); sort2(c, e); sort2(b, c); \
sort2(d, f); sort2(d, e); \
} while(0)
#define sort8(a,b,c,d,e,f,g,h) do { \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(a,c); sort2(b,d); sort2(e,g); sort2(f,h); \
sort2(b,c); sort2(f,g); \
sort2(a,e); sort2(b,f); sort2(c,g); sort2(d,h); \
sort2(c,e); sort2(d,f); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort9(a,b,c,d,e,f,g,h,i) do { \
sort2(a,d); sort2(b,h); sort2(c,f); sort2(e,i); \
sort2(a,h); sort2(c,e); sort2(d,i); sort2(f,g); \
sort2(a,c); sort2(b,d); sort2(e,f); sort2(h,i); \
sort2(b,e); sort2(d,g); sort2(f,h); \
sort2(a,b); sort2(c,e); sort2(d,f); sort2(g,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(b,c); sort2(d,e); sort2(f,g); \
} while (0)
#define sort10(a,b,c,d,e,f,g,h,i,j) do { \
sort2(a,i); sort2(b,j); sort2(c,h); sort2(d,f); sort2(e,g); \
sort2(a,c); sort2(b,e); sort2(f,i); sort2(h,j); \
sort2(a,d); sort2(c,e); sort2(f,h); sort2(g,j); \
sort2(a,b); sort2(d,g); sort2(i,j); \
sort2(b,f); sort2(c,d); sort2(e,i); sort2(g,h); \
sort2(b,c); sort2(d,f); sort2(e,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
sort2(d,e); sort2(f,g); \
} while (0)
#define sort11(a,b,c,d,e,f,g,h,i,j,k) do { \
sort2(a,j); sort2(b,g); sort2(c,e); sort2(d,h); sort2(f,i); \
sort2(a,b); sort2(d,f); sort2(e,k); sort2(g,j); sort2(h,i); \
sort2(b,d); sort2(c,f); sort2(e,h); sort2(i,k); \
sort2(a,e); sort2(b,c); sort2(d,h); sort2(f,j); sort2(g,i); \
sort2(a,b); sort2(c,g); sort2(e,f); sort2(h,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,h); sort2(i,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); \
sort2(c,d); sort2(e,f); sort2(g,h); \
} while (0)
#define sort12(a,b,c,d,e,f,g,h,i,j,k,l) do { \
sort2(a,i); sort2(b,h); sort2(c,g); sort2(d,l); sort2(e,k); sort2(f,j); \
sort2(a,c); sort2(b,e); sort2(d,f); sort2(g,i); sort2(h,k); sort2(j,l); \
sort2(a,b); sort2(c,j); sort2(e,h); sort2(f,g); sort2(k,l); \
sort2(b,d); sort2(c,h); sort2(e,j); sort2(i,k); \
sort2(a,b); sort2(c,d); sort2(e,f); sort2(g,h); sort2(i,j); sort2(k,l); \
sort2(b,c); sort2(d,f); sort2(g,i); sort2(j,k); \
sort2(c,e); sort2(d,g); sort2(f,i); sort2(h,j); \
sort2(b,c); sort2(d,e); sort2(f,g); sort2(h,i); sort2(j,k); \
} while (0)
static void sorting_network(BLQS_TYPE* l, int partszm1_min_1) {
switch (partszm1_min_1) {
case 0: break;
case 1: sort2(l[0],l[1]); break;
case 2: sort3(l[0],l[1],l[2]); break;
case 3: sort4(l[0],l[1],l[2],l[3]); break;
case 4: sort5(l[0],l[1],l[2],l[3],l[4]); break;
case 5: sort6(l[0],l[1],l[2],l[3],l[4],l[5]); break;
case 6: sort7(l[0],l[1],l[2],l[3],l[4],l[5],l[6]); break;
case 7: sort8(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7]); break;
case 8: sort9(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8]); break;
case 9: sort10(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9]); break;
case 10: sort11(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10]); break;
case 11: sort12(l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[9],l[10],l[11]); break;
}
}
#define med5(a,b,c,d,e) do { \
sort2(a,b); sort2(c,d); sort2(a,c); \
sort2(b,d); sort2(b,c); sort2(c,e); \
sort2(b,c); \
} while(0)
static BLQS_TYPE* partition_small(BLQS_TYPE* left, BLQS_TYPE* right) {
BLQS_TYPE* outerleft = left;
BLQS_TYPE* pivp = left + 6;
BLQS_TYPE piv = *pivp;
BLQS_TYPE l1 = left[1],l2 = left[2];
BLQS_TYPE r1 = right[-1], r0 = *right;
med5(l1, l2, piv, r1, r0);
left[1] = l1; left[2] = l2;
right[-1] = r1; *right = r0;
left += 3;
right -= 2;
*pivp = *outerleft;
BLQS_TYPE swbuf[SMALLPART];
BLQS_TYPE* sw = swbuf;
BLQS_TYPE* lwr = left;
while (right - left >= UNROLL) for (int i = UNROLL; i--;) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *sw++ = x;
}
while (left <= right) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *sw++ = x;
}
memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
lwr -= 1;
*outerleft = *lwr;
*lwr = piv;
return lwr;
}
static BLQS_TYPE* partition(BLQS_TYPE* left, BLQS_TYPE* right) {
BLQS_TYPE* outerleft = left;
BLQS_TYPE* pivp = left + (right - left) / 2;
BLQS_TYPE piv = *pivp;
med5(left[1],left[2],left[3],left[4],left[5]);
med5(left[11],left[12],left[13],left[14],left[15]);
med5(pivp[-2], pivp[-1], piv, pivp[1], pivp[2]);
med5(right[-14], right[-13], right[-12], right[-11], right[-10]);
med5(right[-4], right[-3], right[-2], right[-1], right[0]);
med5(left[3], left[13], piv, right[-12], right[-2]);
left += 1;
*pivp = *outerleft;
BLQS_TYPE swbuf[SWSZ];
BLQS_TYPE *rwr = right, *sw = swbuf;
BLQS_TYPE *lwr = left;
while (UNROLL < SWSZ - (sw - swbuf) && left < right - UNROLL) {
ptrdiff_t avail = min(right - left, SWSZ - (sw - swbuf));
BLQS_TYPE* endp = right - avail;
while (right > endp + UNROLL) {
for (int i = UNROLL; i--;) {
BLQS_TYPE x = *right--;
if (BLQS_CMP(x, piv)) *sw++ = x;
else *rwr-- = x;
}
}
}
while (right - left >= UNROLL &&
(rwr - right > UNROLL || left - lwr > UNROLL)) {
while (rwr - right > UNROLL && right - left >= UNROLL) {
for (int i = UNROLL; i--;) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *rwr-- = x;
}
}
while (left - lwr > UNROLL && right - left >= UNROLL) {
for (int i = UNROLL; i--;) {
BLQS_TYPE x = *right--;
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *rwr-- = x;
}
}
}
do {
while (rwr > right && left <= right) {
BLQS_TYPE x = *left++;
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *rwr-- = x;
}
while (lwr < left && left <= right) {
BLQS_TYPE x = *right--;
if (BLQS_CMP(x, piv)) *lwr++ = x;
else *rwr-- = x;
}
} while ((lwr < left||rwr > right) && left <= right);
while (left <= right && !BLQS_CMP(*right, piv)) {
right--;
rwr--;
}
memcpy(lwr, swbuf, (sw - swbuf) * sizeof(BLQS_TYPE));
*outerleft = *rwr;
*rwr = piv;
return rwr;
}
static void smallsort(BLQS_TYPE* left, BLQS_TYPE* right) {
while (right - left > 11) {
BLQS_TYPE* mid = partition_small(left, right);
smallsort(left, mid - 1);
left = mid + 1;
}
sorting_network(left, right - left);
}
static void sortr(BLQS_TYPE* left, BLQS_TYPE* right) {
while (1) {
ptrdiff_t partszm1 = right - left;
if (partszm1 <= SMALLPART) break;
BLQS_TYPE* mid = partition(left, right);
if (mid - left < partszm1 / 16) {
if (mid > left) sortr(left, mid - 1);
BLQS_TYPE piv = *mid;
mid += 1;
// collect duplicates
for (BLQS_TYPE* p = mid; p <= right; p++) {
if (!BLQS_CMP(piv, *p)) {
BLQS_TYPE h = *mid;
*mid = *p;
*p = h;
mid++;
}
}
left = mid;
if (right - left < SMALLPART) break;
mid = partition(left, right);
}
if (mid - left < right - mid) {
sortr(left, mid - 1);
left = mid + 1;
}
else {
sortr(mid + 1, right);
right = mid - 1;
}
}
smallsort(left, right);
}
static void sort(BLQS_TYPE* data, int len) {
if (len < 2) return;
for (BLQS_TYPE* p = data + 1; p < data + len; p++) {
if (BLQS_CMP(*p, *(p - 1))) goto not_sorted;
}
return;
not_sorted:
sortr(data, data + len - 1);
}
#endif
A quick test to make sure everything still works.
Time: 0.70
More than 6 times faster than before, and nearly twice as fast as std::sort. That’s quite something.
This “small cosmetic” change causes Clang to replace branches with csel.
With branches
; x20 = left, x9 = right, d8 = pivot
loop:
ldr d0, [x12], #8
fcmp d0, d8
b.pl ge_case
str d0, [x20], #8 ; left++
b next
ge_case:
str d0, [x9], #-8 ; right--
next:
cmp x12, x_end
b.lt loop
Fast with csel (branchless)
; x20 = left, x11 = right, d8 = pivot, x10 = 8
loop:
ldr d0, [x12], #8 ; load val
fcmp d0, d8 ; compare
csel x13, x20, x11, mi ; if < pivot → left else right
csel x14, x10, xzr, mi ; left += 8 if <
csel x15, x10, xzr, pl ; right -= 8 if >= (!! fix)
str d0, [x13] ; store
add x20, x20, x14 ; update left
sub x11, x11, x15 ; update right
cmp x12, x_end
b.lt loop
On x86, Clang behaves similarly: with the compact if, it generates branchless code using cmov (conditional move).
GCC does not exhibit this “quirk” (different code generation for logically equivalent source). It consistently generates the slower branch-based version.
blqsort - Fast Quicksort with C and C++ Interface
When ‘if’ slows you down, avoid it
christof.kaser@gmail.com