Skip to content

Instantly share code, notes, and snippets.

@jbarczak
Created June 14, 2015 02:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbarczak/bb3f34df8999c06a076b to your computer and use it in GitHub Desktop.
Save jbarczak/bb3f34df8999c06a076b to your computer and use it in GitHub Desktop.
ray reordering with shuffle lut
// Tried this, and it was marginally slower
//
// Some notes about this:
// 1. Separate hit/miss arrays force me to use a lot more stack than I did before, and
// probably don't use the cache quite as well.
// 2. The prefetching of the rays doesn't fit in quite as neatly, and doesn't help anymore if I stick it in there.
// It might make more sense to move that elsewhere anyway.
// 3. LUT is 256 bytes. Not too bad, but it's probably knocking a few rays out of the cache
// 4. Reordering can produce at least one packet that is partially miss and partially hit.
// Separate arrays mean I need to split the ray movement into 2 passes, with a special case in between for the half-full one.
// Compaction lookup table of byte-shuffle controls for _mm_shuffle_epi8,
// indexed by a 4-bit lane mask (bit j corresponds to 32-bit lane j).
//
// For entry k, every set bit j of k (lowest bit first) contributes source
// bytes 4j..4j+3, and the selected lanes are packed toward the front of the
// result. All remaining control bytes are -1: a control byte with its high
// bit set makes _mm_shuffle_epi8 write zero to that output byte, so the
// unused trailing lanes of the result come out as 0.
//
// Size: 16 entries * 16 bytes = 256 bytes.
static const __m128i SHUFFLE_TABLE[16] = {
_mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0000: no lanes
_mm_setr_epi8( 0, 1, 2, 3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0001: lane 0
_mm_setr_epi8( 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0010: lane 1
_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1), // 0011: lanes 0,1
_mm_setr_epi8( 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0100: lane 2
_mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1), // 0101: lanes 0,2
_mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1), // 0110: lanes 1,2
_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1), // 0111: lanes 0,1,2
_mm_setr_epi8(12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 1000: lane 3
_mm_setr_epi8( 0, 1, 2, 3,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1001: lanes 0,3
_mm_setr_epi8( 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1010: lanes 1,3
_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1), // 1011: lanes 0,1,3
_mm_setr_epi8( 8, 9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1100: lanes 2,3
_mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1), // 1101: lanes 0,2,3
_mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1), // 1110: lanes 1,2,3
_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15), // 1111: all lanes (identity)
};
// Partition the ray IDs of every packet into two densely packed lists:
// pHitIDs receives the IDs whose mask bit is set, pMissIDs the IDs whose mask bit is clear.
// nHits / nMisses count the valid entries written to each list so far.
// NOTE(review): every store below writes a full 4 elements regardless of how many are valid;
// the +8 slack presumably exists to absorb that overrun -- confirm that MAX_TRACER_SIZE
// bounds the total number of rays.
uint32 pHitIDs[MAX_TRACER_SIZE+8];
uint32 pMissIDs[MAX_TRACER_SIZE+8];
size_t nHits = 0;
size_t nMisses = 0;
// Byte-addressable view of the ray array. Not referenced again within this excerpt;
// presumably used by the ray movement code that follows -- confirm.
const char* pRays = (const char*) frame.pRays;
for( size_t i=0; i<nGroups; i++ )
{
// Only the low 8 bits of the mask are consumed below
// (presumably one bit per ray, 8 rays per group -- confirm).
uint64 hits = frame.pMasks[i];
// Invert the low 8 bits: a ray that did not hit is a miss.
uint64 misses = hits ^ 0xff;
// Split each 8-bit mask into two 4-bit halves, one per 128-bit vector of IDs.
// Each half doubles as an index into SHUFFLE_TABLE.
uint64 hit_lo = (hits & 0x0f);
uint64 hit_hi = (hits & 0xf0)>>4;
uint64 miss_lo = (misses & 0x0f);
uint64 miss_hi = (misses & 0xf0)>>4;
// load lo/hi ID pairs
// NOTE: These are 32-bit, because they're byte offsets from start of ray array
// This enables the ray read to avoid using shifts to multiply by sizeof(Ray)
// Could try doing the shuffles with m256, but stores would need an extract, which defeats the purpose
// The loads are aligned loads, so RayOffsets must be 16-byte aligned.
uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets;
__m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs );
__m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) );
// store hit/miss IDs
// Each shuffle packs the selected IDs to the front of the vector and zero-fills the rest.
__m128i vhit_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] );
__m128i vhit_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] );
__m128i vmiss_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[miss_lo] );
__m128i vmiss_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[miss_hi] );
// Unaligned stores always write all 4 lanes (valid IDs followed by zero padding).
// The counters advance only by the number of valid IDs, so the next store to the same
// list starts right after them and overwrites the padding. The order of
// store -> popcount -> store -> popcount is therefore significant.
_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_lo );
_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_lo );
// NOTE: Tried replacing _mm_popcnts with an 8-bit LUT. This was even slower
nHits += _mm_popcnt_u64(hit_lo);
nMisses += _mm_popcnt_u64(miss_lo);
_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_hi );
_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_hi );
nHits += _mm_popcnt_u64(hit_hi);
nMisses += _mm_popcnt_u64(miss_hi);
}
// On exit, pHitIDs[0..nHits) and pMissIDs[0..nMisses) hold the packed IDs;
// elements at or beyond those counts are padding, not valid IDs.
// Probably going to try an unrolled variation of the prefix sum. You're right that I could use 16-bit for adds, but if I use 8-bit I can squeeze
// an extra ray group into the upper halves.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment