jbarczak/Reorder_with_shuffle_LUT

## Reorder_with_shuffle_LUT

// Tried this, and it was marginally slower
//
//  Some notes about this:
//  1.  Separate hit/miss arrays force me to use a lot more stack than I did before, and
//        probably don't use the cache quite as well.
//  2.  The prefetching of the rays doesn't fit in quite as neatly, and doesn't help anymore if I stick it in there
//        it might make more sense to move that elsewhere anyway
//  3. LUT is 256 bytes.  Not too bad, but it's probably knocking a few rays out of the cache
//  4.  Reordering can produce at least one packet that is partially miss and partially hit.
//        Separate arrays mean I need to split the ray movement into 2 passes, with a special case in between for the half-full one.

  // Byte-shuffle controls for _mm_shuffle_epi8, indexed by a 4-bit lane mask.
  // For every set bit j of the index (lowest bit first), the four bytes 4j..4j+3
  // of the source vector (i.e. 32-bit lane j) are emitted next, packed toward the
  // front of the result.  All remaining control bytes are -1; a control byte with
  // its top bit set makes _mm_shuffle_epi8 write zero to that output byte.
  // NOTE(review): if this is a function-local static in C++, initialising it through
  //   intrinsics may happen at run time behind an init guard -- confirm in the generated code.
  static const __m128i SHUFFLE_TABLE[16] = {
        _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0000: no lanes
        _mm_setr_epi8( 0, 1, 2, 3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0001: lane 0
        _mm_setr_epi8( 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0010: lane 1
        _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1), // 0011: lanes 0,1
        _mm_setr_epi8( 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0100: lane 2
        _mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1), // 0101: lanes 0,2
        _mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1), // 0110: lanes 1,2
        _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1), // 0111: lanes 0,1,2
        _mm_setr_epi8(12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 1000: lane 3
        _mm_setr_epi8( 0, 1, 2, 3,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1001: lanes 0,3
        _mm_setr_epi8( 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1010: lanes 1,3
        _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1), // 1011: lanes 0,1,3
        _mm_setr_epi8( 8, 9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1100: lanes 2,3
        _mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1), // 1101: lanes 0,2,3
        _mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1), // 1110: lanes 1,2,3
        _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15), // 1111: all lanes
    };

        // Partition the ray IDs of every group into two compacted lists, one for
        // hits and one for misses, using the low 8 bits of frame.pMasks[i] as the
        // per-ray hit mask (set bit = hit).
        //
        // Each store below writes a full 16 bytes regardless of how many lanes are
        // valid; the write cursors only advance by the popcount of the mask, so the
        // zero-filled tail lanes are overwritten by the next store.
        // NOTE(review): the "+8" slack on the arrays presumably exists so these
        //   full-width stores stay in bounds at the end of the list -- confirm
        //   against MAX_TRACER_SIZE and the maximum value of nGroups.
        uint32 pHitIDs[MAX_TRACER_SIZE+8];
        uint32 pMissIDs[MAX_TRACER_SIZE+8];

        // Number of valid entries written so far to pHitIDs / pMissIDs
        size_t nHits   = 0;
        size_t nMisses = 0;

        // Byte-addressed view of the ray array; not referenced inside the loop below
        const char* pRays = (const char*) frame.pRays;
        for( size_t i=0; i<nGroups; i++ )
        {

            // Split the 8-bit hit mask (and its complement) into two 4-bit halves,
            // each of which indexes SHUFFLE_TABLE
            uint64 hits    = frame.pMasks[i];
            uint64 misses  = hits ^ 0xff;
            uint64 hit_lo  = (hits & 0x0f);
            uint64 hit_hi  = (hits & 0xf0)>>4;
            uint64 miss_lo = (misses & 0x0f);
            uint64 miss_hi = (misses & 0xf0)>>4;

            // load lo/hi ID pairs
            // NOTE: These are 32-bit, because they're byte offsets from start of ray array
            // This enables the ray read to avoid using shifts to multiply by sizeof(Ray)
            //  Could try doing the shuffles with m256, but stores would need an extract, which defeats the purpose
            uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets;

            // Aligned loads: _mm_load_si128 requires a 16-byte aligned address.
            // NOTE(review): assumes RayOffsets is 16-byte aligned -- confirm at its declaration.
            __m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs );
            __m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) );

            // store hit/miss iDs
            // Each shuffle packs the selected 32-bit IDs to the front of the vector and zeroes the rest
            __m128i vhit_lo   = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] );
            __m128i vhit_hi   = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] );
            __m128i vmiss_lo  = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[miss_lo] );
            __m128i vmiss_hi  = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[miss_hi] );
            _mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_lo );
            _mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_lo );

            // Advance the cursors by the number of IDs actually kept from the low half,
            // so the high-half stores land directly after them.
            // NOTE: Tried replacing _mm_popcnts with an 8-bit LUT.  This was even slower
            nHits   += _mm_popcnt_u64(hit_lo);
            nMisses += _mm_popcnt_u64(miss_lo);

            _mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_hi );
            _mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_hi );
            nHits   += _mm_popcnt_u64(hit_hi);
            nMisses += _mm_popcnt_u64(miss_hi);
        }

// Probably going to try an unrolled variation of the prefix sum.  You're right that I could use 16bit for adds, but if I use 8-bit I can squeeze
//   an extra ray group into the upper halves

	// Tried this, and it was marginally slower
	//
	// Some notes about this:
	// 1. Separate hit/miss arrays force me to use a lot more stack than I did before, and
	// probably don't use the cache quite as well.
	// 2. The prefetching of the rays doesn't fit in quite as neatly, and doesn't help anymore if I stick it in there
	// it might make more sense to move that elsewhere anyway
	// 3. LUT is 256 bytes. Not too bad, but it's probably knocking a few rays out of the cache
	// 4. Reordering can produce at least one packet that is partially miss and partially hit.
	// Separate arrays mean I need to split the ray movement into 2 passes, with a special case in between for the half-full one.

	// Byte-shuffle controls for _mm_shuffle_epi8, indexed by a 4-bit lane mask.
	// For every set bit j of the index (lowest bit first), the four bytes 4j..4j+3
	// of the source vector (i.e. 32-bit lane j) are emitted next, packed toward the
	// front of the result. All remaining control bytes are -1; a control byte with
	// its top bit set makes _mm_shuffle_epi8 write zero to that output byte.
	// NOTE(review): if this is a function-local static in C++, initialising it through
	// intrinsics may happen at run time behind an init guard -- confirm in the generated code.
	static const __m128i SHUFFLE_TABLE[16] = {
	_mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0000: no lanes
	_mm_setr_epi8( 0, 1, 2, 3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0001: lane 0
	_mm_setr_epi8( 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0010: lane 1
	_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,-1,-1,-1,-1,-1,-1,-1,-1), // 0011: lanes 0,1
	_mm_setr_epi8( 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 0100: lane 2
	_mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1), // 0101: lanes 0,2
	_mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1,-1,-1,-1,-1), // 0110: lanes 1,2
	_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,-1,-1,-1,-1), // 0111: lanes 0,1,2
	_mm_setr_epi8(12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1), // 1000: lane 3
	_mm_setr_epi8( 0, 1, 2, 3,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1001: lanes 0,3
	_mm_setr_epi8( 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1010: lanes 1,3
	_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,12,13,14,15,-1,-1,-1,-1), // 1011: lanes 0,1,3
	_mm_setr_epi8( 8, 9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1), // 1100: lanes 2,3
	_mm_setr_epi8( 0, 1, 2, 3, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1), // 1101: lanes 0,2,3
	_mm_setr_epi8( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,-1,-1,-1,-1), // 1110: lanes 1,2,3
	_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15), // 1111: all lanes
	};

	// Partition the ray IDs of every group into two compacted lists, one for
	// hits and one for misses, using the low 8 bits of frame.pMasks[i] as the
	// per-ray hit mask (set bit = hit).
	//
	// Each store below writes a full 16 bytes regardless of how many lanes are
	// valid; the write cursors only advance by the popcount of the mask, so the
	// zero-filled tail lanes are overwritten by the next store.
	// NOTE(review): the "+8" slack on the arrays presumably exists so these
	// full-width stores stay in bounds at the end of the list -- confirm
	// against MAX_TRACER_SIZE and the maximum value of nGroups.
	uint32 pHitIDs[MAX_TRACER_SIZE+8];
	uint32 pMissIDs[MAX_TRACER_SIZE+8];

	// Number of valid entries written so far to pHitIDs / pMissIDs
	size_t nHits = 0;
	size_t nMisses = 0;

	// Byte-addressed view of the ray array; not referenced inside the loop below
	const char* pRays = (const char*) frame.pRays;
	for( size_t i=0; i<nGroups; i++ )
	{

	// Split the 8-bit hit mask (and its complement) into two 4-bit halves,
	// each of which indexes SHUFFLE_TABLE
	uint64 hits = frame.pMasks[i];
	uint64 misses = hits ^ 0xff;
	uint64 hit_lo = (hits & 0x0f);
	uint64 hit_hi = (hits & 0xf0)>>4;
	uint64 miss_lo = (misses & 0x0f);
	uint64 miss_hi = (misses & 0xf0)>>4;

	// load lo/hi ID pairs
	// NOTE: These are 32-bit, because they're byte offsets from start of ray array
	// This enables the ray read to avoid using shifts to multiply by sizeof(Ray)
	// Could try doing the shuffles with m256, but stores would need an extract, which defeats the purpose
	uint32* __restrict pPacketRayIDs = pPackets[i]->RayOffsets;

	// Aligned loads: _mm_load_si128 requires a 16-byte aligned address.
	// NOTE(review): assumes RayOffsets is 16-byte aligned -- confirm at its declaration.
	__m128i id_lo = _mm_load_si128( (__m128i*) pPacketRayIDs );
	__m128i id_hi = _mm_load_si128( (__m128i*) (pPacketRayIDs+4) );

	// store hit/miss iDs
	// Each shuffle packs the selected 32-bit IDs to the front of the vector and zeroes the rest
	__m128i vhit_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[hit_lo] );
	__m128i vhit_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[hit_hi] );
	__m128i vmiss_lo = _mm_shuffle_epi8( id_lo, SHUFFLE_TABLE[miss_lo] );
	__m128i vmiss_hi = _mm_shuffle_epi8( id_hi, SHUFFLE_TABLE[miss_hi] );
	_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_lo );
	_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_lo );

	// Advance the cursors by the number of IDs actually kept from the low half,
	// so the high-half stores land directly after them.
	// NOTE: Tried replacing _mm_popcnts with an 8-bit LUT. This was even slower
	nHits += _mm_popcnt_u64(hit_lo);
	nMisses += _mm_popcnt_u64(miss_lo);

	_mm_storeu_si128( (__m128i*)&pHitIDs[nHits], vhit_hi );
	_mm_storeu_si128( (__m128i*)&pMissIDs[nMisses], vmiss_hi );
	nHits += _mm_popcnt_u64(hit_hi);
	nMisses += _mm_popcnt_u64(miss_hi);
	}

	// Probably going to try an unrolled variation of the prefix sum. You're right that I could use 16bit for adds, but if I use 8-bit I can squeeze
	// an extra ray group into the upper halves