Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created July 26, 2021 01:51
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rygorous/4e85cb3f315583a18d748b8e7f36c924 to your computer and use it in GitHub Desktop.
Save rygorous/4e85cb3f315583a18d748b8e7f36c924 to your computer and use it in GitHub Desktop.
// Merge pass: combines the run of elemsPerRun elements at inA with the run of
// elemsPerRun elements at inB and writes 2*elemsPerRun elements to out.
//
// Precondition (per the original author): elemsPerRun must be a power of two
// and at least 16. The first 16 elements of each input are loaded
// unconditionally, and all traffic happens in 16-element batches.
//
// NOTE(review): both inputs are presumably already sorted ascending, and
// bitonic_merge16 is expected to leave the 16 smaller values in its first two
// arguments and the 16 larger values in its last two -- confirm against its
// definition.
static void merge_pass(S16 *out, const S16 *inA, const S16 *inB, size_t elemsPerRun)
{
    const S16 *const stopA = inA + elemsPerRun;
    const S16 *const stopB = inB + elemsPerRun;

    // Read cursors; each run's first batch is consumed by the seed loads below.
    const S16 *curA = inA + 16;
    const S16 *curB = inB + 16;

    // Seed the merger: first batch of A in the "low" pair, first batch of B
    // in the "high" pair.
    Vec low0  = load8_s16(inA + 0);
    Vec low1  = load8_s16(inA + 8);
    Vec high0 = load8_s16(inB + 0);
    Vec high1 = load8_s16(inB + 8);

    // Main phase: runs while both inputs still have unread batches.
    while (curA < stopA && curB < stopB)
    {
        // Merge the 32 values currently held, then emit the low 16.
        bitonic_merge16(low0,low1, high0,high1);
        store8_s16(out + 0, low0);
        store8_s16(out + 8, low1);
        out += 16;

        // Refill from the run whose next unread element is smaller (ties go
        // to A). Kept as conditional selects rather than if/else so the
        // choice stays branch-free, as in the original.
        const bool takeA = *curB >= *curA;
        const S16 *src = takeA ? curA : curB;
        curA += takeA ? 16 : 0;
        curB += takeA ? 0 : 16;

        low0 = load8_s16(src + 0);
        low1 = load8_s16(src + 8);
    }

    // Tail phase: one input is exhausted; drain whichever run still has
    // batches left (B when A is the one that ran out).
    const bool restIsA = curA < stopA;
    const S16 *rest = restIsA ? curA : curB;
    const S16 *const restEnd = restIsA ? stopA : stopB;

    for (; rest < restEnd; rest += 16)
    {
        bitonic_merge16(low0,low1, high0,high1);
        store8_s16(out + 0, low0);
        store8_s16(out + 8, low1);
        out += 16;

        low0 = load8_s16(rest + 0);
        low1 = load8_s16(rest + 8);
    }

    // Final batch: nothing left to load, so merge once more and flush all
    // four vectors (32 elements).
    bitonic_merge16(low0,low1, high0,high1);
    store8_s16(out + 0,  low0);
    store8_s16(out + 8,  low1);
    store8_s16(out + 16, high0);
    store8_s16(out + 24, high1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment