Skip to content

Instantly share code, notes, and snippets.

@Pierre-Terdiman
Created February 17, 2017 16:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Pierre-Terdiman/d51dacd0f1edd7d356242e97657b12ed to your computer and use it in GitHub Desktop.
Save Pierre-Terdiman/d51dacd0f1edd7d356242e97657b12ed to your computer and use it in GitHub Desktop.
Preview of part 14 - merging my unrolled version (the "safe loop" here) with Ryg's version converted to C++
#ifdef SAFE_VERSION
#define SIMD_OVERLAP_TEST (_mm_movemask_ps(_mm_cmpngt_ps(b, _mm_load_ps(box)))==15)
#else
#define SIMD_OVERLAP_TEST (_mm_movemask_ps(_mm_cmpnle_ps(_mm_load_ps(box), b))==12)
#endif
bool Meshmerizer::CompleteBoxPruning(udword nb, const AABB* list, Container& pairs)
{
// Checkings
if(!nb || !list)
return false;
SIMD_AABB_X* BoxListX = new SIMD_AABB_X[nb+1+5];
SIMD_AABB_YZ* BoxListYZ = (SIMD_AABB_YZ*)_aligned_malloc(sizeof(SIMD_AABB_YZ)*(nb+1), 16);
udword* Remap;
// {
// Allocate some temporary data
float* PosList = new float[nb+1];
// 1) Build main list using the primary axis
for(udword i=0;i<nb;i++)
PosList[i] = list[i].mMin.x;
PosList[nb] = FLT_MAX;
// 2) Sort the list
static PRUNING_SORTER RS; // Static for coherence
Remap = RS.Sort(PosList, nb+1).GetRanks();
for(udword i=0;i<nb;i++)
{
const udword SortedIndex = Remap[i];
BoxListX[i].InitFrom(list[SortedIndex]);
BoxListYZ[i].InitFrom(list[SortedIndex]);
}
BoxListX[nb].mMinX = FLT_MAX;
BoxListX[nb+1].mMinX = FLT_MAX;
BoxListX[nb+2].mMinX = FLT_MAX;
BoxListX[nb+3].mMinX = FLT_MAX;
BoxListX[nb+4].mMinX = FLT_MAX;
DELETEARRAY(PosList);
// }
// 3) Prune the list
udword RunningAddress = 0;
udword Index0 = 0;
while(RunningAddress<nb && Index0<nb)
{
const SIMD_AABB_X& Box0X = BoxListX[Index0];
const float MinLimit = Box0X.mMinX;
while(BoxListX[RunningAddress++].mMinX<MinLimit);
const SIMD_AABB_YZ& Box0YZ = BoxListYZ[Index0];
SIMD_OVERLAP_INIT(Box0YZ)
const float MaxLimit = Box0X.mMaxX;
const udword RIndex0 = Remap[Index0];
udword Offset = 0;
const char* const CurrentBoxListYZ = (const char*)&BoxListYZ[RunningAddress];
const char* const CurrentBoxListX = (const char*)&BoxListX[RunningAddress];
goto FastLoop;
_asm align 16
FoundSlot2:
Offset += 8;
_asm align 16
FoundSlot1:
Offset += 8;
_asm align 16
FoundSlot0:
Offset += 8;
_asm align 16
FastFoundOne:
{
const udword Index = (CurrentBoxListX + Offset - 8 - (const char*)BoxListX)>>3;
outputPair(RIndex0, Index, pairs, Remap);
}
_asm align 16
FastLoop:
while(*(const float*)(CurrentBoxListX + Offset + 8*4)<=MaxLimit)
{
const float* box = (const float*)(CurrentBoxListYZ + Offset*2);
if(SIMD_OVERLAP_TEST)
goto FoundSlot0;
box = (const float*)(CurrentBoxListYZ + Offset*2 + 16);
if(SIMD_OVERLAP_TEST)
goto FoundSlot1;
box = (const float*)(CurrentBoxListYZ + Offset*2 + 16*2);
if(SIMD_OVERLAP_TEST)
goto FoundSlot2;
box = (const float*)(CurrentBoxListYZ + Offset*2 + 16*3);
Offset += 32;
if(SIMD_OVERLAP_TEST)
goto FastFoundOne;
}
#define BLOCK if(*(const float*)(CurrentBoxListX + Offset)<=MaxLimit) \
{const float* box = (const float*)(CurrentBoxListYZ + Offset*2);\
if(SIMD_OVERLAP_TEST) \
goto BeforeLoop; \
Offset += 8;
goto StartLoop;
_asm align 16
BeforeLoop:
{const udword Index = (CurrentBoxListX + Offset - (const char*)BoxListX)>>3;
outputPair(RIndex0, Index, pairs, Remap);
Offset += 8;
}
_asm align 16
StartLoop:
BLOCK
BLOCK
BLOCK
BLOCK
BLOCK
}
}
}
}
goto StartLoop;
}
Index0++;
}
_aligned_free(BoxListYZ);
DELETEARRAY(BoxListX);
return true;
}
@sfiruch
Copy link

sfiruch commented Feb 17, 2017

Ryg FTW! :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment