-
-
Save pervognsen/fa2a8cba0dc852094067348a57492ecb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Branchless merge (7 cycles/elem on Zen 3) | |
void merge1(u32 *dest, const u32 *src1, const u32 *end1, const u32 *src2, const u32 *end2) { | |
for (;;) { | |
if (unlikely(src1 == end1)) { | |
memcpy(dest, src2, (end2 - src2) * sizeof(u32)); | |
return; | |
} | |
if (unlikely(src2 == end2)) { | |
memcpy(dest, src1, (end1 - src1) * sizeof(u32)); | |
return; | |
} | |
u32 val1 = *src1; | |
u32 val2 = *src2; | |
int cond = val1 <= val2; | |
*dest++ = cond ? val1 : val2; | |
src1 = cond ? src1 + 1 : src1; | |
src2 = cond ? src2 : src2 + 1; | |
} | |
} | |
// Bidirectional branchless merge (3.5 cycles/elem on Zen 3) | |
void merge2(u32 *dest, const u32 *src1, const u32 *end1, const u32 *src2, const u32 *end2) { | |
u32 *end = dest + (end1 - src1) + (end2 - src2); | |
for (;;) { | |
if (unlikely(src1 + 1 >= end1 || src2 + 1 >= end2)) | |
return merge1(dest, src1, end1, src2, end2); | |
// Front | |
u32 val1 = *src1; | |
u32 val2 = *src2; | |
int cond = val1 <= val2; | |
*dest++ = cond ? val1 : val2; | |
src1 = cond ? src1 + 1 : src1; | |
src2 = cond ? src2 : src2 + 1; | |
// Back | |
val1 = end1[-1]; | |
val2 = end2[-1]; | |
cond = val1 > val2; | |
*--end = cond ? val1 : val2; | |
end1 = cond ? end1 - 1 : end1; | |
end2 = cond ? end2 : end2 - 1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
May I ask what is the license for this snippet? Would it be possible to reuse it in an LGPLv2 project?