Skip to content

Instantly share code, notes, and snippets.

Created June 14, 2015 05:47
Embed
What would you like to do?
ray reordering with shuffle LUT
// Tried this, and it was marginally slower
//
// Some notes about this:
// 1. Separate hit/miss arrays force me to use a lot more stack than I did before, and
// probably doesn't use the cache quite as well.
//
// ->Answer: no, I was thinking still 1 array, still filling it from both ends,
// just with 16 slots worth of padding in the middle! Something like this.
// Group hits left->right, misses right->left
// Label is hit flag for lane 0, 1, 2, 3
//
// NOTE: top and bottom halves of this table are actually identical.
// This way is clearer, but if you want to save the cache space, feel free
// to only store half of it.
// Byte-shuffle controls for _mm_shuffle_epi8, indexed by a 4-bit hit mask
// (bit n set = lane n hit). Each entry moves the dwords of the hit lanes to
// the front in ascending lane order and the dwords of the missed lanes to
// the back in descending lane order, so one shuffled vector can be stored
// both at the hit cursor and just below the miss cursor.
static const __m128i SHUFFLE_TABLE[16] = {
// Expands to the four consecutive byte indices that make up 32-bit lane i.
// All four terms must scale the lane by 4; the last term previously used
// (i)+4, which selected a byte from the wrong dword.
#define DWORD_IND(i) ((i)*4 + 0), ((i)*4 + 1), ((i)*4 + 2), ((i)*4 + 3)
// Builds a 16-byte shuffle control that places source lanes a,b,c,d into
// destination lanes 0,1,2,3.
#define SHUFFLE(a,b,c,d) _mm_setr_epi8(DWORD_IND(a), DWORD_IND(b), DWORD_IND(c), DWORD_IND(d))
// hits | misses         hit flag for lane 0 1 2 3
SHUFFLE( 3,2,1,0), // 0 0 0 0
SHUFFLE(0, 3,2,1), // 1 0 0 0
SHUFFLE(1, 3,2,0), // 0 1 0 0
SHUFFLE(0,1, 3,2), // 1 1 0 0
SHUFFLE(2, 3,1,0), // 0 0 1 0
SHUFFLE(0,2, 3,1), // 1 0 1 0
SHUFFLE(1,2, 3,0), // 0 1 1 0
SHUFFLE(0,1,2, 3), // 1 1 1 0
SHUFFLE(3, 2,1,0), // 0 0 0 1
SHUFFLE(0,3, 2,1), // 1 0 0 1
SHUFFLE(1,3, 2,0), // 0 1 0 1
SHUFFLE(0,1,3, 2), // 1 1 0 1
SHUFFLE(2,3, 1,0), // 0 0 1 1
SHUFFLE(0,2,3, 1), // 1 0 1 1
SHUFFLE(1,2,3, 0), // 0 1 1 1
SHUFFLE(0,1,2,3 ), // 1 1 1 1
#undef SHUFFLE
#undef DWORD_IND
};
// Scratch buffer of ray IDs, filled from both ends: hits grow upward from
// index 0, misses grow downward from the end. Every store below writes a
// full 4-entry vector even when fewer entries are valid, so the extra 16
// entries serve as slack between the two regions.
uint32 pHitMissIDs[MAX_TRACER_SIZE+16];
size_t nHits = 0;
size_t firstMiss = MAX_TRACER_SIZE + 16;
// Invariant:
// pHitMissIDs[0:nHits] contains hits (left bound inclusive, right exclusive)
// pHitMissIDs[firstMiss:MAX_TRACER_SIZE+16] contains misses
// rest of array contains garbage
const char* pRays = (const char*) frame.pRays;
for( size_t i=0; i<nGroups; i++ )
{
    // Split the low 8 bits of the group's hit mask into two 4-bit halves,
    // one per 128-bit vector of IDs.
    uint64 hits = frame.pMasks[i];
    uint64 hit_lo = (hits & 0x0f);
    uint64 hit_hi = (hits & 0xf0)>>4;
    uint64 nHit_lo = _mm_popcnt_u64(hit_lo);
    uint64 nHit_hi = _mm_popcnt_u64(hit_hi);

    // load lo/hi ID pairs
    // NOTE: These are 32-bit, because they're byte offsets from start of ray array
    // This enables the ray read to avoid using shifts to multiply by sizeof(Ray)
    // Could try doing the shuffles with m256, but stores would need an extract, which defeats the purpose
    // NOTE(review): the aligned loads assume RayOffsets is 16-byte aligned -- confirm.
    uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets;
    __m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs );
    __m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) );

    // Reorder each half so hit IDs sit at the front of the vector and
    // missed IDs at the back.
    __m128i vhitmiss_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] );
    __m128i vhitmiss_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] );

    // Low half: write the whole vector at both cursors, then advance each
    // cursor only by the number of entries that are valid for its side.
    _mm_storeu_si128( (__m128i*)&pHitMissIDs[nHits], vhitmiss_lo );
    _mm_storeu_si128( (__m128i*)&pHitMissIDs[firstMiss-4], vhitmiss_lo );
    nHits += nHit_lo;
    firstMiss -= 4 - nHit_lo;

    // High half: same scheme, but the cursors must move by the HIGH half's
    // hit count (the original reused nHit_lo here, desynchronising both
    // cursors whenever the two halves had different hit counts).
    _mm_storeu_si128( (__m128i*)&pHitMissIDs[nHits], vhitmiss_hi );
    _mm_storeu_si128( (__m128i*)&pHitMissIDs[firstMiss-4], vhitmiss_hi );
    nHits += nHit_hi;
    firstMiss -= 4 - nHit_hi;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment