Created
February 17, 2017 16:00
-
-
Save Pierre-Terdiman/d51dacd0f1edd7d356242e97657b12ed to your computer and use it in GitHub Desktop.
Preview of part 14 - merging my unrolled version (the "safe loop" here) with Ryg's version converted to C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifdef SAFE_VERSION | |
#define SIMD_OVERLAP_TEST (_mm_movemask_ps(_mm_cmpngt_ps(b, _mm_load_ps(box)))==15) | |
#else | |
#define SIMD_OVERLAP_TEST (_mm_movemask_ps(_mm_cmpnle_ps(_mm_load_ps(box), b))==12) | |
#endif | |
bool Meshmerizer::CompleteBoxPruning(udword nb, const AABB* list, Container& pairs) | |
{ | |
// Checkings | |
if(!nb || !list) | |
return false; | |
SIMD_AABB_X* BoxListX = new SIMD_AABB_X[nb+1+5]; | |
SIMD_AABB_YZ* BoxListYZ = (SIMD_AABB_YZ*)_aligned_malloc(sizeof(SIMD_AABB_YZ)*(nb+1), 16); | |
udword* Remap; | |
// { | |
// Allocate some temporary data | |
float* PosList = new float[nb+1]; | |
// 1) Build main list using the primary axis | |
for(udword i=0;i<nb;i++) | |
PosList[i] = list[i].mMin.x; | |
PosList[nb] = FLT_MAX; | |
// 2) Sort the list | |
static PRUNING_SORTER RS; // Static for coherence | |
Remap = RS.Sort(PosList, nb+1).GetRanks(); | |
for(udword i=0;i<nb;i++) | |
{ | |
const udword SortedIndex = Remap[i]; | |
BoxListX[i].InitFrom(list[SortedIndex]); | |
BoxListYZ[i].InitFrom(list[SortedIndex]); | |
} | |
BoxListX[nb].mMinX = FLT_MAX; | |
BoxListX[nb+1].mMinX = FLT_MAX; | |
BoxListX[nb+2].mMinX = FLT_MAX; | |
BoxListX[nb+3].mMinX = FLT_MAX; | |
BoxListX[nb+4].mMinX = FLT_MAX; | |
DELETEARRAY(PosList); | |
// } | |
// 3) Prune the list | |
udword RunningAddress = 0; | |
udword Index0 = 0; | |
while(RunningAddress<nb && Index0<nb) | |
{ | |
const SIMD_AABB_X& Box0X = BoxListX[Index0]; | |
const float MinLimit = Box0X.mMinX; | |
while(BoxListX[RunningAddress++].mMinX<MinLimit); | |
const SIMD_AABB_YZ& Box0YZ = BoxListYZ[Index0]; | |
SIMD_OVERLAP_INIT(Box0YZ) | |
const float MaxLimit = Box0X.mMaxX; | |
const udword RIndex0 = Remap[Index0]; | |
udword Offset = 0; | |
const char* const CurrentBoxListYZ = (const char*)&BoxListYZ[RunningAddress]; | |
const char* const CurrentBoxListX = (const char*)&BoxListX[RunningAddress]; | |
goto FastLoop; | |
_asm align 16 | |
FoundSlot2: | |
Offset += 8; | |
_asm align 16 | |
FoundSlot1: | |
Offset += 8; | |
_asm align 16 | |
FoundSlot0: | |
Offset += 8; | |
_asm align 16 | |
FastFoundOne: | |
{ | |
const udword Index = (CurrentBoxListX + Offset - 8 - (const char*)BoxListX)>>3; | |
outputPair(RIndex0, Index, pairs, Remap); | |
} | |
_asm align 16 | |
FastLoop: | |
while(*(const float*)(CurrentBoxListX + Offset + 8*4)<=MaxLimit) | |
{ | |
const float* box = (const float*)(CurrentBoxListYZ + Offset*2); | |
if(SIMD_OVERLAP_TEST) | |
goto FoundSlot0; | |
box = (const float*)(CurrentBoxListYZ + Offset*2 + 16); | |
if(SIMD_OVERLAP_TEST) | |
goto FoundSlot1; | |
box = (const float*)(CurrentBoxListYZ + Offset*2 + 16*2); | |
if(SIMD_OVERLAP_TEST) | |
goto FoundSlot2; | |
box = (const float*)(CurrentBoxListYZ + Offset*2 + 16*3); | |
Offset += 32; | |
if(SIMD_OVERLAP_TEST) | |
goto FastFoundOne; | |
} | |
#define BLOCK if(*(const float*)(CurrentBoxListX + Offset)<=MaxLimit) \ | |
{const float* box = (const float*)(CurrentBoxListYZ + Offset*2);\ | |
if(SIMD_OVERLAP_TEST) \ | |
goto BeforeLoop; \ | |
Offset += 8; | |
goto StartLoop; | |
_asm align 16 | |
BeforeLoop: | |
{const udword Index = (CurrentBoxListX + Offset - (const char*)BoxListX)>>3; | |
outputPair(RIndex0, Index, pairs, Remap); | |
Offset += 8; | |
} | |
_asm align 16 | |
StartLoop: | |
BLOCK | |
BLOCK | |
BLOCK | |
BLOCK | |
BLOCK | |
} | |
} | |
} | |
} | |
goto StartLoop; | |
} | |
Index0++; | |
} | |
_aligned_free(BoxListYZ); | |
DELETEARRAY(BoxListX); | |
return true; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ryg FTW! :)