This gist will be to place steps and numbers for optimization of msd radix sort.
Benchmakring for Travis' radix_sort7
LSD radix sort implementation:
CPU Caches:
L1 Data 32K (x28)
L1 Instruction 32K (x28)
L2 Unified 1024K (x28)
L3 Unified 19712K (x2)
Load Average: 0.47, 0.44, 0.63
--------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------------
SortingBmk_random_wholeRange/StdStableSort/10000 537 us 535 us 5230
SortingBmk_random_wholeRange/StdStableSort/60000 3921 us 3912 us 716
SortingBmk_random_wholeRange/StdStableSort/110000 7686 us 7668 us 365
SortingBmk_random_wholeRange/StdStableSort/160000 11455 us 11428 us 245
SortingBmk_random_wholeRange/StdStableSort/210000 15475 us 15438 us 181
SortingBmk_random_wholeRange/StdStableSort/260000 19690 us 19643 us 143
SortingBmk_random_wholeRange/StdStableSort/310000 23834 us 23778 us 118
SortingBmk_random_wholeRange/StdStableSort/360000 28007 us 27941 us 100
SortingBmk_random_wholeRange/StdStableSort/410000 32287 us 32211 us 87
SortingBmk_random_wholeRange/StdStableSort/460000 36361 us 36273 us 77
SortingBmk_random_wholeRange/StdStableSort/510000 40226 us 40130 us 70
SortingBmk_random_wholeRange/StdStableSort/560000 44344 us 44239 us 63
CPU Caches:
L1 Data 32K (x28)
L1 Instruction 32K (x28)
L2 Unified 1024K (x28)
L3 Unified 19712K (x2)
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------------------
SortingBmk_random_wholeRange/MSDRadixSort/10000 195 us 195 us 14394
SortingBmk_random_wholeRange/MSDRadixSort/60000 1212 us 1209 us 2316
SortingBmk_random_wholeRange/MSDRadixSort/110000 2439 us 2433 us 1155
SortingBmk_random_wholeRange/MSDRadixSort/160000 3531 us 3522 us 795
SortingBmk_random_wholeRange/MSDRadixSort/210000 4624 us 4613 us 608
SortingBmk_random_wholeRange/MSDRadixSort/260000 5732 us 5718 us 491
SortingBmk_random_wholeRange/MSDRadixSort/310000 6789 us 6773 us 413
SortingBmk_random_wholeRange/MSDRadixSort/360000 7929 us 7910 us 353
SortingBmk_random_wholeRange/MSDRadixSort/410000 9034 us 9012 us 311
SortingBmk_random_wholeRange/MSDRadixSort/460000 10086 us 10062 us 278
SortingBmk_random_wholeRange/MSDRadixSort/510000 11157 us 11131 us 252
SortingBmk_random_wholeRange/MSDRadixSort/560000 12223 us 12194 us 230
CPU Caches:
L1 Data 32K (x28)
L1 Instruction 32K (x28)
L2 Unified 1024K (x28)
L3 Unified 19712K (x2)
Load Average: 0.99, 0.63, 0.68
-------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------------------
SortingBmk_random_wholeRange/LSDRadixSort/10000 82.2 us 82.1 us 34223
SortingBmk_random_wholeRange/LSDRadixSort/60000 552 us 550 us 5092
SortingBmk_random_wholeRange/LSDRadixSort/110000 1442 us 1438 us 1941
SortingBmk_random_wholeRange/LSDRadixSort/160000 2103 us 2099 us 1337
SortingBmk_random_wholeRange/LSDRadixSort/210000 2781 us 2775 us 1009
SortingBmk_random_wholeRange/LSDRadixSort/260000 3461 us 3453 us 811
SortingBmk_random_wholeRange/LSDRadixSort/310000 4143 us 4133 us 678
SortingBmk_random_wholeRange/LSDRadixSort/360000 4835 us 4823 us 580
SortingBmk_random_wholeRange/LSDRadixSort/410000 5543 us 5530 us 505
SortingBmk_random_wholeRange/LSDRadixSort/460000 6233 us 6219 us 452
SortingBmk_random_wholeRange/LSDRadixSort/510000 6912 us 6895 us 406
SortingBmk_random_wholeRange/LSDRadixSort/560000 7599 us 7581 us 369
========================================================= The first version of MSD is below.
Code
#include <bits/stdc++.h>
using namespace std;
using T = uint64_t;
const size_t RADIX_BITS = 1;
const size_t RADIX_SIZE = (size_t)1 << RADIX_BITS;
const size_t RADIX_LEVELS = (63 / RADIX_BITS) + 1;
const size_t DO_FINAL_SWAP = RADIX_LEVELS & 1;
const T RADIX_MASK = RADIX_SIZE - 1;
constexpr auto partFunc = [](T v, size_t i) -> int { return (v >> i) & RADIX_MASK; }; // it looks to be inlined
// pass <- [7, 0]
void radix_msd_rec(vector<T>& from, vector<T>& to, size_t lo, size_t hi, size_t pass) {
//cout << "ITER " << lo << " " << hi << ", " << pass << endl;
size_t shift = pass * RADIX_BITS;
size_t freq[RADIX_SIZE] = {};
for (size_t i = lo; i < hi; ++i) {
T value = from[i];
++freq[partFunc(value, shift)];
}
T* queue_ptrs[RADIX_SIZE];
queue_ptrs[0] = &to[lo];
for (size_t i = 1; i < RADIX_SIZE; ++i)
queue_ptrs[i] = queue_ptrs[i-1] + freq[i-1];
for (size_t i = lo; i < hi; ++i) {
T value = from[i];
size_t index = partFunc(value, shift);
*queue_ptrs[index] = value;
++queue_ptrs[index];
}
auto newLo = lo;
auto newHi = lo + freq[0];
for (size_t i = 1; i < RADIX_SIZE; ++i) {
if (newHi - newLo > 1 && pass != 0) // at least one element to sort
radix_msd_rec(to, from, newLo, newHi, pass - 1);
newLo = newHi;
newHi += freq[i];
}
if (newHi - newLo > 1 && pass != 0)
radix_msd_rec(to, from, newLo, newHi, pass - 1);
}
void radix_sort_msd(vector<T>& data)
{
vector<T> buf(data.size());
radix_msd_rec(data, buf, 0, data.size(), 2);
if (DO_FINAL_SWAP)
swap(data, buf);
}
int main()
{
vector<T> data =
{0b11,
0b10,
0b01,
0b00};
radix_sort_msd(data);
for (auto& v : data) {
cout << bitset<2>(v) << endl;
}
for (auto& v : data) {
cout << v << " ";
}
cout << endl;
return 0;
}
Next steps: