Created
April 20, 2018 16:55
-
-
Save jdryg/f76f159f7e8eae15dba58dd5f66211ee to your computer and use it in GitHub Desktop.
cullRects SIMD benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define TEST_SIMD 3 | |
struct Rect | |
{ | |
float minx, miny; | |
float maxx, maxy; | |
}; | |
struct RectSoA | |
{ | |
float* minx; | |
float* miny; | |
float* maxx; | |
float* maxy; | |
}; | |
__declspec(noinline) | |
void cullRects(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float camRectMinX = camRect->minx; | |
const float camRectMinY = camRect->miny; | |
const float camRectMaxX = camRect->maxx; | |
const float camRectMaxY = camRect->maxy; | |
for (uint32_t i = 0; i < numRects; ++i) { | |
const Rect* r = &rects[i]; | |
const float minX = r->minx; | |
const float minY = r->miny; | |
const float maxX = r->maxx; | |
const float maxY = r->maxy; | |
if (maxX < camRectMinX || | |
maxY < camRectMinY || | |
minX > camRectMaxX || | |
minY > camRectMaxY) | |
{ | |
results[i] = false; | |
} else { | |
results[i] = true; | |
} | |
} | |
} | |
__declspec(noinline) | |
void cullRects2(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float camRectMinX = camRect->minx; | |
const float camRectMinY = camRect->miny; | |
const float camRectMaxX = camRect->maxx; | |
const float camRectMaxY = camRect->maxy; | |
for (uint32_t i = 0; i < numRects; ++i) { | |
const Rect* r = &rects[i]; | |
const uint32_t culled = 0 | |
| ((r->minx > camRectMaxX) ? 1 : 0) | |
| ((r->miny > camRectMaxY) ? 1 : 0) | |
| ((r->maxx < camRectMinX) ? 1 : 0) | |
| ((r->maxy < camRectMinY) ? 1 : 0); | |
results[i] = culled == 1 ? 0 : 1; | |
} | |
} | |
static const uint32_t visibilityResults16[16] = { | |
0x01010101, // { 0, 0, 0, 0 } | |
0x01010100, // { 0, 0, 0, 1 } | |
0x01010001, // { 0, 0, 1, 0 } | |
0x01010000, // { 0, 0, 1, 1 } | |
0x01000101, // { 0, 1, 0, 0 } | |
0x01000100, // { 0, 1, 0, 1 } | |
0x01000001, // { 0, 1, 1, 0 } | |
0x01000000, // { 0, 1, 1, 1 } | |
0x00010101, // { 1, 0, 0, 0 } | |
0x00010100, // { 1, 0, 0, 1 } | |
0x00010001, // { 1, 0, 1, 0 } | |
0x00010000, // { 1, 0, 1, 1 } | |
0x00000101, // { 1, 1, 0, 0 } | |
0x00000100, // { 1, 1, 0, 1 } | |
0x00000001, // { 1, 1, 1, 0 } | |
0x00000000, // { 1, 1, 1, 1 } | |
}; | |
__declspec(noinline) | |
void cullRectsSoA(const RectSoA* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float camRectMinX = camRect->minx; | |
const float camRectMinY = camRect->miny; | |
const float camRectMaxX = camRect->maxx; | |
const float camRectMaxY = camRect->maxy; | |
const float* minx = rects->minx; | |
const float* miny = rects->miny; | |
const float* maxx = rects->maxx; | |
const float* maxy = rects->maxy; | |
for (uint32_t i = 0; i < numRects; ++i) { | |
if (maxx[i] < camRectMinX || | |
maxy[i] < camRectMinY || | |
minx[i] > camRectMaxX || | |
miny[i] > camRectMaxY) { | |
results[i] = false; | |
} else { | |
results[i] = true; | |
} | |
} | |
} | |
__declspec(noinline) | |
void cullRectsSSE(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float* rectData = (const float*)rects; | |
uint32_t* resData = (uint32_t*)results; | |
const __m128 camMinX = _mm_set_ps1(camRect->minx); | |
const __m128 camMinY = _mm_set_ps1(camRect->miny); | |
const __m128 camMaxX = _mm_set_ps1(camRect->maxx); | |
const __m128 camMaxY = _mm_set_ps1(camRect->maxy); | |
const uint32_t iter = numRects >> 2; | |
for (uint32_t i = 0; i < iter; ++i) { | |
const __m128 r0 = _mm_load_ps(rectData + 0); // { r0.minx, r0.miny, r0.maxx, r0.maxy } | |
const __m128 r1 = _mm_load_ps(rectData + 4); // { r1.minx, r1.miny, r1.maxx, r1.maxy } | |
const __m128 r2 = _mm_load_ps(rectData + 8); // { r2.minx, r2.miny, r2.maxx, r2.maxy } | |
const __m128 r3 = _mm_load_ps(rectData + 12); // { r3.minx, r3.miny, r3.maxx, r3.maxy } | |
const __m128 r01_min = _mm_movelh_ps(r0, r1); // { r0.minx, r0.miny, r1.minx, r1.miny } | |
const __m128 r01_max = _mm_movehl_ps(r1, r0); // { r0.maxx, r0.maxy, r1.maxx, r1.maxy } | |
const __m128 r23_min = _mm_movelh_ps(r2, r3); // { r2.minx, r2.miny, r3.minx, r3.miny } | |
const __m128 r23_max = _mm_movehl_ps(r3, r2); // { r2.maxx, r2.maxy, r3.maxx, r3.maxy } | |
const __m128 minx = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.minx, r1.minx, r2.minx, r3.minx } | |
const __m128 miny = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.miny, r1.miny, r2.miny, r3.miny } | |
const __m128 maxx = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.maxx, r1.maxx, r2.maxx, r3.maxx } | |
const __m128 maxy = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.maxy, r1.maxy, r2.maxy, r3.maxy } | |
const __m128 minx_gt_camMaxX = _mm_cmpgt_ps(minx, camMaxX); | |
const __m128 miny_gt_camMaxY = _mm_cmpgt_ps(miny, camMaxY); | |
const __m128 maxx_lt_camMinX = _mm_cmplt_ps(maxx, camMinX); | |
const __m128 maxy_lt_camMinY = _mm_cmplt_ps(maxy, camMinY); | |
const __m128 culled0 = _mm_or_ps(minx_gt_camMaxX, miny_gt_camMaxY); | |
const __m128 culled1 = _mm_or_ps(maxx_lt_camMinX, maxy_lt_camMinY); | |
const __m128 culled = _mm_or_ps(culled0, culled1); | |
int mask = _mm_movemask_ps(culled); | |
*resData = visibilityResults16[mask]; | |
rectData += 16; | |
++resData; | |
} | |
} | |
__declspec(noinline) | |
void cullRectsSSE2(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float* rectData = (const float*)rects; | |
uint32_t* resData = (uint32_t*)results; | |
const __m128 camMinX = _mm_set_ps1(camRect->minx); | |
const __m128 camMinY = _mm_set_ps1(camRect->miny); | |
const __m128 camMaxX = _mm_set_ps1(camRect->maxx); | |
const __m128 camMaxY = _mm_set_ps1(camRect->maxy); | |
const uint32_t iter = numRects >> 3; | |
for (uint32_t i = 0; i < iter; ++i) { | |
const __m128 r0 = _mm_load_ps(rectData + 0); // { r0.minx, r0.miny, r0.maxx, r0.maxy } | |
const __m128 r1 = _mm_load_ps(rectData + 4); // { r1.minx, r1.miny, r1.maxx, r1.maxy } | |
const __m128 r2 = _mm_load_ps(rectData + 8); // { r2.minx, r2.miny, r2.maxx, r2.maxy } | |
const __m128 r3 = _mm_load_ps(rectData + 12); // { r3.minx, r3.miny, r3.maxx, r3.maxy } | |
const __m128 r4 = _mm_load_ps(rectData + 16); | |
const __m128 r5 = _mm_load_ps(rectData + 20); | |
const __m128 r6 = _mm_load_ps(rectData + 24); | |
const __m128 r7 = _mm_load_ps(rectData + 28); | |
const __m128 r01_min = _mm_movelh_ps(r0, r1); // { r0.minx, r0.miny, r1.minx, r1.miny } | |
const __m128 r01_max = _mm_movehl_ps(r1, r0); // { r0.maxx, r0.maxy, r1.maxx, r1.maxy } | |
const __m128 r23_min = _mm_movelh_ps(r2, r3); // { r2.minx, r2.miny, r3.minx, r3.miny } | |
const __m128 r23_max = _mm_movehl_ps(r3, r2); // { r2.maxx, r2.maxy, r3.maxx, r3.maxy } | |
const __m128 r45_min = _mm_movelh_ps(r4, r5); | |
const __m128 r45_max = _mm_movehl_ps(r5, r4); | |
const __m128 r67_min = _mm_movelh_ps(r6, r7); | |
const __m128 r67_max = _mm_movehl_ps(r7, r6); | |
const __m128 minx0123 = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.minx, r1.minx, r2.minx, r3.minx } | |
const __m128 miny0123 = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.miny, r1.miny, r2.miny, r3.miny } | |
const __m128 maxx0123 = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.maxx, r1.maxx, r2.maxx, r3.maxx } | |
const __m128 maxy0123 = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.maxy, r1.maxy, r2.maxy, r3.maxy } | |
const __m128 minx4567 = _mm_shuffle_ps(r45_min, r67_min, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128 miny4567 = _mm_shuffle_ps(r45_min, r67_min, _MM_SHUFFLE(3, 1, 3, 1)); | |
const __m128 maxx4567 = _mm_shuffle_ps(r45_max, r67_max, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128 maxy4567 = _mm_shuffle_ps(r45_max, r67_max, _MM_SHUFFLE(3, 1, 3, 1)); | |
const __m128 minx0123_gt_camMaxX = _mm_cmpgt_ps(minx0123, camMaxX); | |
const __m128 miny0123_gt_camMaxY = _mm_cmpgt_ps(miny0123, camMaxY); | |
const __m128 maxx0123_lt_camMinX = _mm_cmplt_ps(maxx0123, camMinX); | |
const __m128 maxy0123_lt_camMinY = _mm_cmplt_ps(maxy0123, camMinY); | |
const __m128 minx4567_gt_camMaxX = _mm_cmpgt_ps(minx4567, camMaxX); | |
const __m128 miny4567_gt_camMaxY = _mm_cmpgt_ps(miny4567, camMaxY); | |
const __m128 maxx4567_lt_camMinX = _mm_cmplt_ps(maxx4567, camMinX); | |
const __m128 maxy4567_lt_camMinY = _mm_cmplt_ps(maxy4567, camMinY); | |
const __m128 culled0_0123 = _mm_or_ps(minx0123_gt_camMaxX, miny0123_gt_camMaxY); | |
const __m128 culled1_0123 = _mm_or_ps(maxx0123_lt_camMinX, maxy0123_lt_camMinY); | |
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123); | |
const __m128 culled0_4567 = _mm_or_ps(minx4567_gt_camMaxX, miny4567_gt_camMaxY); | |
const __m128 culled1_4567 = _mm_or_ps(maxx4567_lt_camMinX, maxy4567_lt_camMinY); | |
const __m128 culled4567 = _mm_or_ps(culled0_4567, culled1_4567); | |
int mask0123 = _mm_movemask_ps(culled0123); | |
int mask4567 = _mm_movemask_ps(culled4567); | |
resData[0] = visibilityResults16[mask0123]; | |
resData[1] = visibilityResults16[mask4567]; | |
rectData += 32; | |
resData += 2; | |
} | |
} | |
__declspec(noinline) | |
void cullRectsSSE3(const RectSoA* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float* minx = rects->minx; | |
const float* miny = rects->miny; | |
const float* maxx = rects->maxx; | |
const float* maxy = rects->maxy; | |
uint32_t* resData = (uint32_t*)results; | |
const __m128 cam = _mm_loadu_ps((float*)camRect); | |
const __m128 camMinX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 camMinY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 camMaxX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(2, 2, 2, 2)); | |
const __m128 camMaxY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(3, 3, 3, 3)); | |
const uint32_t iter = numRects >> 3; | |
for (uint32_t i = 0; i < iter; ++i) { | |
const __m128 culled0_0123 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny), camMaxY)); | |
const __m128 culled1_0123 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy), camMinY)); | |
const __m128 culled0_4567 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 4), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 4), camMaxY)); | |
const __m128 culled1_4567 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 4), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 4), camMinY)); | |
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123); | |
const __m128 culled4567 = _mm_or_ps(culled0_4567, culled1_4567); | |
int mask0123 = _mm_movemask_ps(culled0123); | |
int mask4567 = _mm_movemask_ps(culled4567); | |
resData[0] = visibilityResults16[mask0123]; | |
resData[1] = visibilityResults16[mask4567]; | |
resData += 2; | |
minx += 8; | |
miny += 8; | |
maxx += 8; | |
maxy += 8; | |
} | |
uint32_t rem = numRects & 7; | |
if (rem >= 4) { | |
const __m128 minx0123_gt_camMaxX = _mm_cmpgt_ps(_mm_load_ps(minx), camMaxX); | |
const __m128 miny0123_gt_camMaxY = _mm_cmpgt_ps(_mm_load_ps(miny), camMaxY); | |
const __m128 culled0_0123 = _mm_or_ps(minx0123_gt_camMaxX, miny0123_gt_camMaxY); | |
const __m128 maxx0123_lt_camMinX = _mm_cmplt_ps(_mm_load_ps(maxx), camMinX); | |
const __m128 maxy0123_lt_camMinY = _mm_cmplt_ps(_mm_load_ps(maxy), camMinY); | |
const __m128 culled1_0123 = _mm_or_ps(maxx0123_lt_camMinX, maxy0123_lt_camMinY); | |
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123); | |
int mask0123 = _mm_movemask_ps(culled0123); | |
resData[0] = visibilityResults16[mask0123]; | |
minx += 4; | |
miny += 4; | |
maxx += 4; | |
maxy += 4; | |
resData++; | |
rem -= 4; | |
} | |
} | |
__declspec(noinline) | |
void cullRectsSSE4(const RectSoA* rects, uint32_t numRects, const Rect* camRect, bool* results) | |
{ | |
const float* minx = rects->minx; | |
const float* miny = rects->miny; | |
const float* maxx = rects->maxx; | |
const float* maxy = rects->maxy; | |
uint32_t* resData = (uint32_t*)results; | |
const __m128 cam = _mm_loadu_ps((float*)camRect); | |
const __m128 camMinX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 camMinY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 camMaxX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(2, 2, 2, 2)); | |
const __m128 camMaxY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(3, 3, 3, 3)); | |
static const uint32_t allBits[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; | |
const __m128 xmm_all = _mm_loadu_ps((float*)&allBits); | |
const uint32_t iter = numRects >> 4; | |
for (uint32_t i = 0; i < iter; ++i) { | |
const __m128 culled0_0123 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny), camMaxY)); | |
const __m128 culled1_0123 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy), camMinY)); | |
const __m128 culled0_4567 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 4), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 4), camMaxY)); | |
const __m128 culled1_4567 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 4), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 4), camMinY)); | |
const __m128 culled0_891011 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 8), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 8), camMaxY)); | |
const __m128 culled1_891011 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 8), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 8), camMinY)); | |
const __m128 culled0_12131415 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 12), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 12), camMaxY)); | |
const __m128 culled1_12131415 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 12), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 12), camMinY)); | |
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123); | |
const __m128 culled4567 = _mm_or_ps(culled0_4567, culled1_4567); | |
const __m128 culled891011 = _mm_or_ps(culled0_891011, culled1_891011); | |
const __m128 culled12131415 = _mm_or_ps(culled0_12131415, culled1_12131415); | |
int mask0123 = _mm_movemask_ps(culled0123); | |
int mask4567 = _mm_movemask_ps(culled4567); | |
int mask891011 = _mm_movemask_ps(culled891011); | |
int mask12131415 = _mm_movemask_ps(culled12131415); | |
resData[0] = visibilityResults16[mask0123]; | |
resData[1] = visibilityResults16[mask4567]; | |
resData[2] = visibilityResults16[mask891011]; | |
resData[3] = visibilityResults16[mask12131415]; | |
resData += 4; | |
minx += 16; | |
miny += 16; | |
maxx += 16; | |
maxy += 16; | |
} | |
} | |
float randomFloat(float minValue, float maxValue) | |
{ | |
const float t = (float)rand() / (float)RAND_MAX; | |
return minValue + t * (maxValue - minValue); | |
} | |
volatile uint32_t NUM_RECTS = 1024; | |
int main() | |
{ | |
#ifdef _DEBUG | |
const uint32_t NUM_ITERATIONS = 1; | |
#else | |
const uint32_t NUM_ITERATIONS = 10000000; | |
#endif | |
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS); | |
SetThreadAffinityMask(GetCurrentThread(), 0x00000001); | |
srand(1); | |
Rect* rects = (Rect*)_aligned_malloc(sizeof(Rect) * NUM_RECTS, 16); | |
for (uint32_t i = 0; i < NUM_RECTS; ++i) { | |
const float x = randomFloat(-100.0f, 100.0f); | |
const float y = randomFloat(-100.0f, 100.0f); | |
const float w = randomFloat(0.0f, 20.0f); | |
const float h = randomFloat(0.0f, 20.0f); | |
rects[i].minx = x; | |
rects[i].miny = y; | |
rects[i].maxx = x + w; | |
rects[i].maxy = y + h; | |
} | |
RectSoA rectSoA; | |
float* soaData = (float*)_aligned_malloc(sizeof(float) * 4 * NUM_RECTS, 16); | |
rectSoA.minx = soaData; | |
rectSoA.miny = soaData + NUM_RECTS; | |
rectSoA.maxx = soaData + NUM_RECTS * 2; | |
rectSoA.maxy = soaData + NUM_RECTS * 3; | |
for (uint32_t i = 0; i < NUM_RECTS; ++i) { | |
rectSoA.minx[i] = rects[i].minx; | |
rectSoA.miny[i] = rects[i].miny; | |
rectSoA.maxx[i] = rects[i].maxx; | |
rectSoA.maxy[i] = rects[i].maxy; | |
} | |
bool* results = (bool*)_aligned_malloc(sizeof(bool) * NUM_RECTS, 16); | |
#if 1 | |
// ~1/4 of the view | |
const float camX = -50.0f; | |
const float camY = -50.0f; | |
const float camW = 100.0f; | |
const float camH = 100.0f; | |
#else | |
#if 0 | |
// All visible | |
const float camX = -200.0f; | |
const float camY = -200.0f; | |
const float camW = 400.0f; | |
const float camH = 400.0f; | |
#else | |
// None visible | |
const float camX = 200.0f; | |
const float camY = 200.0f; | |
const float camW = 400.0f; | |
const float camH = 400.0f; | |
#endif | |
#endif | |
const Rect cameraRect = { | |
camX, camY, camX + camW, camY + camH | |
}; | |
int64_t start = __rdtsc(); | |
for (uint32_t i = 0; i < NUM_ITERATIONS; ++i) { | |
#if TEST_SIMD == 1 | |
cullRectsSSE(rects, NUM_RECTS, &cameraRect, results); | |
#elif TEST_SIMD == 2 | |
cullRectsSSE2(rects, NUM_RECTS, &cameraRect, results); | |
#elif TEST_SIMD == 3 | |
cullRectsSSE3(&rectSoA, NUM_RECTS, &cameraRect, results); | |
#elif TEST_SIMD == 4 | |
cullRectsSSE4(&rectSoA, NUM_RECTS, &cameraRect, results); | |
#elif TEST_SIMD == 5 | |
cullRects2(rects, NUM_RECTS, &cameraRect, results); | |
#else | |
#if 1 | |
cullRects(rects, NUM_RECTS, &cameraRect, results); | |
#else | |
cullRectsSoA(&rectSoA, NUM_RECTS, &cameraRect, results); | |
#endif | |
#endif | |
} | |
int64_t elapsed = __rdtsc() - start; | |
printf("(%lld) %.0f\n", elapsed, (double)elapsed / (double)NUM_ITERATIONS); | |
#if TEST_SIMD | |
bool* refResults = (bool*)_aligned_malloc(sizeof(bool) * NUM_RECTS, 16); | |
cullRects(rects, NUM_RECTS, &cameraRect, refResults); | |
bool err = false; | |
for (uint32_t i = 0; i < NUM_RECTS; ++i) { | |
if (refResults[i] != results[i]) { | |
err = true; | |
break; | |
} | |
} | |
if (err) { | |
printf("Invalid results\n"); | |
} | |
#endif // TEST_SIMD | |
uint32_t numVisible = 0; | |
for (uint32_t i = 0; i < NUM_RECTS; ++i) { | |
numVisible += results[i] ? 1 : 0; | |
} | |
printf("# visible: %u\n", numVisible); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment