Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
// Byte-shuffle controls (for _mm_shuffle_epi8) that partition the four
// 32-bit lanes of a vector according to a 4-bit mask. ReorderRays indexes
// this table with one nibble of a packet's mask (hit_lo / hit_hi).
//
// For index m, the lanes whose bit is set in m are moved to the front in
// ascending lane order; the remaining lanes follow in descending lane order.
// Rows 8..15 repeat rows 0..7: lane 3 always lands directly after the
// selected lanes among 0..2, so it is the last "set" lane when bit 3 is set
// and the first "clear" lane when it is not.
//
// SHUFFLE_DWORDS(a,b,c,d) expands to the byte-level control that places
// source dword a in lane 0, b in lane 1, c in lane 2 and d in lane 3.
#define SHUFFLE_DWORDS( a, b, c, d )                          \
    _mm_setr_epi8( 4*(a), 4*(a)+1, 4*(a)+2, 4*(a)+3,          \
                   4*(b), 4*(b)+1, 4*(b)+2, 4*(b)+3,          \
                   4*(c), 4*(c)+1, 4*(c)+2, 4*(c)+3,          \
                   4*(d), 4*(d)+1, 4*(d)+2, 4*(d)+3 )
static const __m128i SHUFFLE_TABLE[16] = {
    SHUFFLE_DWORDS( 3, 2, 1, 0 ),   // mask 0000
    SHUFFLE_DWORDS( 0, 3, 2, 1 ),   // mask 0001
    SHUFFLE_DWORDS( 1, 3, 2, 0 ),   // mask 0010
    SHUFFLE_DWORDS( 0, 1, 3, 2 ),   // mask 0011
    SHUFFLE_DWORDS( 2, 3, 1, 0 ),   // mask 0100
    SHUFFLE_DWORDS( 0, 2, 3, 1 ),   // mask 0101
    SHUFFLE_DWORDS( 1, 2, 3, 0 ),   // mask 0110
    SHUFFLE_DWORDS( 0, 1, 2, 3 ),   // mask 0111
    SHUFFLE_DWORDS( 3, 2, 1, 0 ),   // mask 1000
    SHUFFLE_DWORDS( 0, 3, 2, 1 ),   // mask 1001
    SHUFFLE_DWORDS( 1, 3, 2, 0 ),   // mask 1010
    SHUFFLE_DWORDS( 0, 1, 3, 2 ),   // mask 1011
    SHUFFLE_DWORDS( 2, 3, 1, 0 ),   // mask 1100
    SHUFFLE_DWORDS( 0, 2, 3, 1 ),   // mask 1101
    SHUFFLE_DWORDS( 1, 2, 3, 0 ),   // mask 1110
    SHUFFLE_DWORDS( 0, 1, 2, 3 ),   // mask 1111
};
#undef SHUFFLE_DWORDS
// Builds a reordered list of ray IDs for the first nGroups active packets and
// passes it to ReadRaysLoop.
//
// For each packet, eight IDs are read from RayOffsets and split by the low
// eight bits of the packet's mask in frame.pMasks: IDs whose bit is set are
// appended at the front of pReorderIDs (growing upward from index 0), the
// others are placed at the back (growing downward from index 8*nGroups).
//
// frame   - supplies pActivePackets, pMasks and pRays.
// nGroups - number of packets to process; 8*nGroups slots of pReorderIDs are used.
//
// NOTE(review): nothing here checks 8*nGroups against MAX_TRACER_SIZE;
// presumably the caller guarantees it - confirm.
static void __fastcall ReorderRays( StackFrame& frame, size_t nGroups )
{
RayPacket** pPackets = frame.pActivePackets;
// Scratch list of reordered IDs; lives on this function's stack and is handed
// to ReadRaysLoop through args.pRayIDs below.
uint32 pReorderIDs[MAX_TRACER_SIZE];
// Next free slot at the front (set-bit IDs are appended here).
size_t nHits = 0;
// One past the next free slot at the back (clear-bit IDs are prepended here).
size_t nFirstMiss = 8*nGroups;
const char* pRays = (const char*) frame.pRays;
for( size_t i=0; i<nGroups; i++ )
{
uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets;
// Only bits 0..7 of the mask are used: one nibble per group of four IDs.
uint64 hits = frame.pMasks[i];
uint64 hit_lo = (hits & 0x0f);
uint64 hit_hi = (hits & 0xf0)>>4;
// Number of set bits in each nibble, i.e. how many IDs of each group of four go to the front.
uint64 pop_lo = _mm_popcnt_u64(hit_lo);
uint64 pop_hi = _mm_popcnt_u64(hit_hi);
// Load IDs 0..3 and 4..7 as two vectors. These are aligned loads, so
// RayOffsets has to be 16-byte aligned.
__m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs );
__m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) );
// Partition each vector: set-bit IDs end up in the low lanes, the rest in the
// high lanes (see SHUFFLE_TABLE).
__m128i shuf_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] );
__m128i shuf_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] );
// Front side: every store writes all four lanes, but the cursor only advances
// past the pop_* leading lanes, so the next store overwrites the rest.
_mm_storeu_si128( (__m128i*)&pReorderIDs[nHits], shuf_lo );
nHits += pop_lo;
_mm_storeu_si128( (__m128i*)&pReorderIDs[nHits], shuf_hi );
nHits += pop_hi;
// NOTE: Hits MUST be written before misses, or a full-hit packet can corrupt the miss area
// Back side: the vector is stored so that its last lane lands in the slot just
// below nFirstMiss; the cursor then drops by the 4-pop_* trailing lanes only.
_mm_storeu_si128( (__m128i*)&pReorderIDs[nFirstMiss-4], shuf_lo );
nFirstMiss -= 4-pop_lo;
_mm_storeu_si128( (__m128i*)&pReorderIDs[nFirstMiss-4], shuf_hi );
nFirstMiss -= 4-pop_hi;
}
// Hand the packets, the reordered ID list and the ray data to ReadRaysLoop.
// NOTE(review): the comment here originally read "full hit packets", but
// nHits and nFirstMiss are not consulted after the loop and ReadRaysLoop is
// called with nGroups - confirm what was intended.
ReadRaysLoopArgs args;
args.pPackets = pPackets;
args.pRayIDs = pReorderIDs;
args.pRays = (const byte*)pRays;
ReadRaysLoop(args,nGroups);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.