Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
ray reordering with shuffle lut
// Tried this, and it was marginally slower
//
// Some notes about this:
// 1. Separate hit/miss arrays force me to use a lot more stack than I did before, and
// probably don't use the cache quite as well.
// 2. The prefetching of the rays doesn't fit in quite as neatly, and doesn't help anymore if I stick it in there.
// It might make more sense to move that elsewhere anyway.
// 3. LUT is 256 bytes. Not too bad, but it's probably knocking a few rays out of the cache
// 4. Reordering can produce at least one packet that is partially miss and partially hit.
// Separate arrays mean I need to split the ray movement into 2 passes, with a special case in between for the half-full one.
// Byte-shuffle controls for _mm_shuffle_epi8, indexed by a 4-bit mask.
// Bit k of the index selects 32-bit lane k of the source; the selected lanes are packed
// towards the front of the result in ascending lane order. Unused trailing lanes use
// index -1 (high bit set), which makes the shuffle write zero bytes there.
#define SHUFFLE_TABLE_NONE     -1,-1,-1,-1
#define SHUFFLE_TABLE_LANE(n)  (4*(n)), (4*(n)+1), (4*(n)+2), (4*(n)+3)
static const __m128i SHUFFLE_TABLE[16] = {
    /* 0000 */ _mm_setr_epi8( SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0001 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0010 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0011 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0100 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0101 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0110 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 0111 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_NONE    ),
    /* 1000 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 1001 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 1010 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 1011 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE    ),
    /* 1100 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE,    SHUFFLE_TABLE_NONE    ),
    /* 1101 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE    ),
    /* 1110 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_LANE(3), SHUFFLE_TABLE_NONE    ),
    /* 1111 */ _mm_setr_epi8( SHUFFLE_TABLE_LANE(0), SHUFFLE_TABLE_LANE(1), SHUFFLE_TABLE_LANE(2), SHUFFLE_TABLE_LANE(3) ),
};
#undef SHUFFLE_TABLE_LANE
#undef SHUFFLE_TABLE_NONE
uint32 pHitIDs[MAX_TRACER_SIZE+8];
uint32 pMissIDs[MAX_TRACER_SIZE+8];
size_t nHits = 0;
size_t nMisses = 0;
const char* pRays = (const char*) frame.pRays;
for( size_t i=0; i<nGroups; i++ )
{
uint64 hits = frame.pMasks[i];
uint64 misses = hits ^ 0xff;
uint64 hit_lo = (hits & 0x0f);
uint64 hit_hi = (hits & 0xf0)>>4;
uint64 miss_lo = (misses & 0x0f);
uint64 miss_hi = (misses & 0xf0)>>4;
// load lo/hi ID pairs
// NOTE: These are 32-bit, because they're byte offsets from start of ray array
// This enables the ray read to avoid using shifts to multiply by sizeof(Ray)
// Could try doing the shuffles with m256, but stores would need an extract, which defeats the purpose
uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets;
__m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs );
__m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) );
// store hit/miss iDs
__m128i vhit_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] );
__m128i vhit_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] );
__m128i vmiss_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[miss_lo] );
__m128i vmiss_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[miss_hi] );
_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_lo );
_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_lo );
// NOTE: Tried replacing _mm_popcnts with an 8-bit LUT. This was even slower
nHits += _mm_popcnt_u64(hit_lo);
nMisses += _mm_popcnt_u64(miss_lo);
_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_hi );
_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_hi );
nHits += _mm_popcnt_u64(hit_hi);
nMisses += _mm_popcnt_u64(miss_hi);
}
// Probably going to try an unrolled variation of the prefix sum. You're right that I could use 16-bit for adds, but if I use 8-bit I can squeeze
// an extra ray group into the upper halves.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.