Skip to content

Instantly share code, notes, and snippets.

@jbarczak
Last active August 29, 2015 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbarczak/300553226e4af57b9485 to your computer and use it in GitHub Desktop.
Save jbarczak/300553226e4af57b9485 to your computer and use it in GitHub Desktop.
Prefix sum improvements suggested by ryg
// Builds a reordered list of ray IDs for the active packets and hands it to
// ReadRaysLoop. For each group of 8 rays, IDs whose bit in frame.pMasks[i] is
// set ("hit") are written to the front of the list in ascending order, while
// the remaining IDs ("miss") are written from the back of the list downward.
//
// frame   - provides pActivePackets, pMasks and pRays
// nGroups - number of 8-ray groups; the ID list spans 8*nGroups entries
//
// NOTE(review): slot indices are computed in 16-bit lanes and the list lives in
// a MAX_TRACER_SIZE stack array; presumably callers keep 8*nGroups within both
// limits -- confirm.
static void __fastcall ReorderRays( StackFrame& frame, size_t nGroups )
{
    const unsigned __int64 BYTE_ONES = 0x0101010101010101;

    RayPacket** ppGroups = frame.pActivePackets;
    const char* pRayBytes = (const char*) frame.pRays;
    uint32 pSortedIDs[MAX_TRACER_SIZE];

    size_t nFront = 0;           // next slot to receive a hit (moves upward)
    size_t nBack  = 8*nGroups;   // one past the next slot to receive a miss (moves downward)

    for( size_t g=0; g<nGroups; g++ )
    {
        uint32* __restrict pGroupIDs = ppGroups[g]->RayOffsets;

        // Snapshot both write cursors into every 16-bit lane before they are advanced
        __m128i vFrontBase = _mm_broadcastw_epi16(_mm_cvtsi64_si128(nFront));
        __m128i vBackBase  = _mm_broadcastw_epi16(_mm_cvtsi64_si128(nBack-1));

        // Spread the low 8 mask bits so that each ray owns one byte holding 0 or 1
        unsigned __int64 hitBytes  = _pdep_u64( frame.pMasks[g], BYTE_ONES );
        unsigned __int64 missBytes = BYTE_ONES ^ hitBytes;
        __m128i vHit  = _mm_cvtsi64_si128(hitBytes);
        __m128i vMiss = _mm_cvtsi64_si128(missBytes);

        // Per-ray selector: (miss - 1) is 0x00 for a miss and 0xff for a hit,
        // and the sign-extending widen turns that into 0x0000 / 0xffff
        __m128i vSelectHit = _mm_cvtepi8_epi16(
            _mm_sub_epi8(vMiss, _mm_cvtsi64_si128(BYTE_ONES)) );

        // Byte-wise prefix sums built from shift+add steps of 1, 2 and 4 bytes
        // (idea credited to @rygorous in the original). The original author notes
        // that 128-bit registers were chosen over packing hit/miss into one
        // 256-bit register to avoid cross-lane pack/unpack costs.
        // Subtracting the input afterwards excludes each ray from its own sum.
        __m128i vHitRank = _mm_add_epi8(vHit, _mm_slli_si128(vHit,1));
        vHitRank = _mm_add_epi8(vHitRank, _mm_slli_si128(vHitRank,2));
        vHitRank = _mm_add_epi8(vHitRank, _mm_slli_si128(vHitRank,4));
        vHitRank = _mm_cvtepi8_epi16(_mm_sub_epi8(vHitRank, vHit));

        __m128i vMissRank = _mm_add_epi8(vMiss, _mm_slli_si128(vMiss,1));
        vMissRank = _mm_add_epi8(vMissRank, _mm_slli_si128(vMissRank,2));
        vMissRank = _mm_add_epi8(vMissRank, _mm_slli_si128(vMissRank,4));
        vMissRank = _mm_cvtepi8_epi16(_mm_sub_epi8(vMissRank, vMiss));

        // Hits count up from the front cursor, misses count down from the back cursor
        __m128i vHitSlot  = _mm_add_epi16(vHitRank, vFrontBase);
        __m128i vMissSlot = _mm_sub_epi16(vBackBase, vMissRank);
        __m128i vSlot     = _mm_blendv_epi8(vMissSlot, vHitSlot, vSelectHit);

        // Scatter the eight IDs; the lane index must be a compile-time constant
        pSortedIDs[ _mm_extract_epi16(vSlot,0) ] = pGroupIDs[0];
        pSortedIDs[ _mm_extract_epi16(vSlot,1) ] = pGroupIDs[1];
        pSortedIDs[ _mm_extract_epi16(vSlot,2) ] = pGroupIDs[2];
        pSortedIDs[ _mm_extract_epi16(vSlot,3) ] = pGroupIDs[3];
        pSortedIDs[ _mm_extract_epi16(vSlot,4) ] = pGroupIDs[4];
        pSortedIDs[ _mm_extract_epi16(vSlot,5) ] = pGroupIDs[5];
        pSortedIDs[ _mm_extract_epi16(vSlot,6) ] = pGroupIDs[6];
        pSortedIDs[ _mm_extract_epi16(vSlot,7) ] = pGroupIDs[7];

        // Advance the cursors by this group's hit and miss counts
        size_t nGroupHits = _mm_popcnt_u64(frame.pMasks[g]);
        nFront += nGroupHits;
        nBack  -= (8-nGroupHits);
    }

    ReadRaysLoopArgs args;
    args.pPackets = ppGroups;
    args.pRayIDs  = pSortedIDs;
    args.pRays    = (const byte*)pRayBytes;
    ReadRaysLoop(args,nGroups);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment