Skip to content

Instantly share code, notes, and snippets.

@jdryg
Created April 20, 2018 16:55
Show Gist options
  • Save jdryg/f76f159f7e8eae15dba58dd5f66211ee to your computer and use it in GitHub Desktop.
Save jdryg/f76f159f7e8eae15dba58dd5f66211ee to your computer and use it in GitHub Desktop.
cullRects SIMD benchmark
#define TEST_SIMD 3
struct Rect
{
float minx, miny;
float maxx, maxy;
};
struct RectSoA
{
float* minx;
float* miny;
float* maxx;
float* maxy;
};
__declspec(noinline)
void cullRects(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float camRectMinX = camRect->minx;
const float camRectMinY = camRect->miny;
const float camRectMaxX = camRect->maxx;
const float camRectMaxY = camRect->maxy;
for (uint32_t i = 0; i < numRects; ++i) {
const Rect* r = &rects[i];
const float minX = r->minx;
const float minY = r->miny;
const float maxX = r->maxx;
const float maxY = r->maxy;
if (maxX < camRectMinX ||
maxY < camRectMinY ||
minX > camRectMaxX ||
minY > camRectMaxY)
{
results[i] = false;
} else {
results[i] = true;
}
}
}
__declspec(noinline)
void cullRects2(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float camRectMinX = camRect->minx;
const float camRectMinY = camRect->miny;
const float camRectMaxX = camRect->maxx;
const float camRectMaxY = camRect->maxy;
for (uint32_t i = 0; i < numRects; ++i) {
const Rect* r = &rects[i];
const uint32_t culled = 0
| ((r->minx > camRectMaxX) ? 1 : 0)
| ((r->miny > camRectMaxY) ? 1 : 0)
| ((r->maxx < camRectMinX) ? 1 : 0)
| ((r->maxy < camRectMinY) ? 1 : 0);
results[i] = culled == 1 ? 0 : 1;
}
}
static const uint32_t visibilityResults16[16] = {
0x01010101, // { 0, 0, 0, 0 }
0x01010100, // { 0, 0, 0, 1 }
0x01010001, // { 0, 0, 1, 0 }
0x01010000, // { 0, 0, 1, 1 }
0x01000101, // { 0, 1, 0, 0 }
0x01000100, // { 0, 1, 0, 1 }
0x01000001, // { 0, 1, 1, 0 }
0x01000000, // { 0, 1, 1, 1 }
0x00010101, // { 1, 0, 0, 0 }
0x00010100, // { 1, 0, 0, 1 }
0x00010001, // { 1, 0, 1, 0 }
0x00010000, // { 1, 0, 1, 1 }
0x00000101, // { 1, 1, 0, 0 }
0x00000100, // { 1, 1, 0, 1 }
0x00000001, // { 1, 1, 1, 0 }
0x00000000, // { 1, 1, 1, 1 }
};
__declspec(noinline)
void cullRectsSoA(const RectSoA* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float camRectMinX = camRect->minx;
const float camRectMinY = camRect->miny;
const float camRectMaxX = camRect->maxx;
const float camRectMaxY = camRect->maxy;
const float* minx = rects->minx;
const float* miny = rects->miny;
const float* maxx = rects->maxx;
const float* maxy = rects->maxy;
for (uint32_t i = 0; i < numRects; ++i) {
if (maxx[i] < camRectMinX ||
maxy[i] < camRectMinY ||
minx[i] > camRectMaxX ||
miny[i] > camRectMaxY) {
results[i] = false;
} else {
results[i] = true;
}
}
}
__declspec(noinline)
void cullRectsSSE(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float* rectData = (const float*)rects;
uint32_t* resData = (uint32_t*)results;
const __m128 camMinX = _mm_set_ps1(camRect->minx);
const __m128 camMinY = _mm_set_ps1(camRect->miny);
const __m128 camMaxX = _mm_set_ps1(camRect->maxx);
const __m128 camMaxY = _mm_set_ps1(camRect->maxy);
const uint32_t iter = numRects >> 2;
for (uint32_t i = 0; i < iter; ++i) {
const __m128 r0 = _mm_load_ps(rectData + 0); // { r0.minx, r0.miny, r0.maxx, r0.maxy }
const __m128 r1 = _mm_load_ps(rectData + 4); // { r1.minx, r1.miny, r1.maxx, r1.maxy }
const __m128 r2 = _mm_load_ps(rectData + 8); // { r2.minx, r2.miny, r2.maxx, r2.maxy }
const __m128 r3 = _mm_load_ps(rectData + 12); // { r3.minx, r3.miny, r3.maxx, r3.maxy }
const __m128 r01_min = _mm_movelh_ps(r0, r1); // { r0.minx, r0.miny, r1.minx, r1.miny }
const __m128 r01_max = _mm_movehl_ps(r1, r0); // { r0.maxx, r0.maxy, r1.maxx, r1.maxy }
const __m128 r23_min = _mm_movelh_ps(r2, r3); // { r2.minx, r2.miny, r3.minx, r3.miny }
const __m128 r23_max = _mm_movehl_ps(r3, r2); // { r2.maxx, r2.maxy, r3.maxx, r3.maxy }
const __m128 minx = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.minx, r1.minx, r2.minx, r3.minx }
const __m128 miny = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.miny, r1.miny, r2.miny, r3.miny }
const __m128 maxx = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.maxx, r1.maxx, r2.maxx, r3.maxx }
const __m128 maxy = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.maxy, r1.maxy, r2.maxy, r3.maxy }
const __m128 minx_gt_camMaxX = _mm_cmpgt_ps(minx, camMaxX);
const __m128 miny_gt_camMaxY = _mm_cmpgt_ps(miny, camMaxY);
const __m128 maxx_lt_camMinX = _mm_cmplt_ps(maxx, camMinX);
const __m128 maxy_lt_camMinY = _mm_cmplt_ps(maxy, camMinY);
const __m128 culled0 = _mm_or_ps(minx_gt_camMaxX, miny_gt_camMaxY);
const __m128 culled1 = _mm_or_ps(maxx_lt_camMinX, maxy_lt_camMinY);
const __m128 culled = _mm_or_ps(culled0, culled1);
int mask = _mm_movemask_ps(culled);
*resData = visibilityResults16[mask];
rectData += 16;
++resData;
}
}
__declspec(noinline)
void cullRectsSSE2(const Rect* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float* rectData = (const float*)rects;
uint32_t* resData = (uint32_t*)results;
const __m128 camMinX = _mm_set_ps1(camRect->minx);
const __m128 camMinY = _mm_set_ps1(camRect->miny);
const __m128 camMaxX = _mm_set_ps1(camRect->maxx);
const __m128 camMaxY = _mm_set_ps1(camRect->maxy);
const uint32_t iter = numRects >> 3;
for (uint32_t i = 0; i < iter; ++i) {
const __m128 r0 = _mm_load_ps(rectData + 0); // { r0.minx, r0.miny, r0.maxx, r0.maxy }
const __m128 r1 = _mm_load_ps(rectData + 4); // { r1.minx, r1.miny, r1.maxx, r1.maxy }
const __m128 r2 = _mm_load_ps(rectData + 8); // { r2.minx, r2.miny, r2.maxx, r2.maxy }
const __m128 r3 = _mm_load_ps(rectData + 12); // { r3.minx, r3.miny, r3.maxx, r3.maxy }
const __m128 r4 = _mm_load_ps(rectData + 16);
const __m128 r5 = _mm_load_ps(rectData + 20);
const __m128 r6 = _mm_load_ps(rectData + 24);
const __m128 r7 = _mm_load_ps(rectData + 28);
const __m128 r01_min = _mm_movelh_ps(r0, r1); // { r0.minx, r0.miny, r1.minx, r1.miny }
const __m128 r01_max = _mm_movehl_ps(r1, r0); // { r0.maxx, r0.maxy, r1.maxx, r1.maxy }
const __m128 r23_min = _mm_movelh_ps(r2, r3); // { r2.minx, r2.miny, r3.minx, r3.miny }
const __m128 r23_max = _mm_movehl_ps(r3, r2); // { r2.maxx, r2.maxy, r3.maxx, r3.maxy }
const __m128 r45_min = _mm_movelh_ps(r4, r5);
const __m128 r45_max = _mm_movehl_ps(r5, r4);
const __m128 r67_min = _mm_movelh_ps(r6, r7);
const __m128 r67_max = _mm_movehl_ps(r7, r6);
const __m128 minx0123 = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.minx, r1.minx, r2.minx, r3.minx }
const __m128 miny0123 = _mm_shuffle_ps(r01_min, r23_min, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.miny, r1.miny, r2.miny, r3.miny }
const __m128 maxx0123 = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(2, 0, 2, 0)); // { r0.maxx, r1.maxx, r2.maxx, r3.maxx }
const __m128 maxy0123 = _mm_shuffle_ps(r01_max, r23_max, _MM_SHUFFLE(3, 1, 3, 1)); // { r0.maxy, r1.maxy, r2.maxy, r3.maxy }
const __m128 minx4567 = _mm_shuffle_ps(r45_min, r67_min, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 miny4567 = _mm_shuffle_ps(r45_min, r67_min, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 maxx4567 = _mm_shuffle_ps(r45_max, r67_max, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 maxy4567 = _mm_shuffle_ps(r45_max, r67_max, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 minx0123_gt_camMaxX = _mm_cmpgt_ps(minx0123, camMaxX);
const __m128 miny0123_gt_camMaxY = _mm_cmpgt_ps(miny0123, camMaxY);
const __m128 maxx0123_lt_camMinX = _mm_cmplt_ps(maxx0123, camMinX);
const __m128 maxy0123_lt_camMinY = _mm_cmplt_ps(maxy0123, camMinY);
const __m128 minx4567_gt_camMaxX = _mm_cmpgt_ps(minx4567, camMaxX);
const __m128 miny4567_gt_camMaxY = _mm_cmpgt_ps(miny4567, camMaxY);
const __m128 maxx4567_lt_camMinX = _mm_cmplt_ps(maxx4567, camMinX);
const __m128 maxy4567_lt_camMinY = _mm_cmplt_ps(maxy4567, camMinY);
const __m128 culled0_0123 = _mm_or_ps(minx0123_gt_camMaxX, miny0123_gt_camMaxY);
const __m128 culled1_0123 = _mm_or_ps(maxx0123_lt_camMinX, maxy0123_lt_camMinY);
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123);
const __m128 culled0_4567 = _mm_or_ps(minx4567_gt_camMaxX, miny4567_gt_camMaxY);
const __m128 culled1_4567 = _mm_or_ps(maxx4567_lt_camMinX, maxy4567_lt_camMinY);
const __m128 culled4567 = _mm_or_ps(culled0_4567, culled1_4567);
int mask0123 = _mm_movemask_ps(culled0123);
int mask4567 = _mm_movemask_ps(culled4567);
resData[0] = visibilityResults16[mask0123];
resData[1] = visibilityResults16[mask4567];
rectData += 32;
resData += 2;
}
}
__declspec(noinline)
void cullRectsSSE3(const RectSoA* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float* minx = rects->minx;
const float* miny = rects->miny;
const float* maxx = rects->maxx;
const float* maxy = rects->maxy;
uint32_t* resData = (uint32_t*)results;
const __m128 cam = _mm_loadu_ps((float*)camRect);
const __m128 camMinX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 camMinY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 camMaxX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 camMaxY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(3, 3, 3, 3));
const uint32_t iter = numRects >> 3;
for (uint32_t i = 0; i < iter; ++i) {
const __m128 culled0_0123 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny), camMaxY));
const __m128 culled1_0123 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy), camMinY));
const __m128 culled0_4567 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 4), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 4), camMaxY));
const __m128 culled1_4567 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 4), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 4), camMinY));
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123);
const __m128 culled4567 = _mm_or_ps(culled0_4567, culled1_4567);
int mask0123 = _mm_movemask_ps(culled0123);
int mask4567 = _mm_movemask_ps(culled4567);
resData[0] = visibilityResults16[mask0123];
resData[1] = visibilityResults16[mask4567];
resData += 2;
minx += 8;
miny += 8;
maxx += 8;
maxy += 8;
}
uint32_t rem = numRects & 7;
if (rem >= 4) {
const __m128 minx0123_gt_camMaxX = _mm_cmpgt_ps(_mm_load_ps(minx), camMaxX);
const __m128 miny0123_gt_camMaxY = _mm_cmpgt_ps(_mm_load_ps(miny), camMaxY);
const __m128 culled0_0123 = _mm_or_ps(minx0123_gt_camMaxX, miny0123_gt_camMaxY);
const __m128 maxx0123_lt_camMinX = _mm_cmplt_ps(_mm_load_ps(maxx), camMinX);
const __m128 maxy0123_lt_camMinY = _mm_cmplt_ps(_mm_load_ps(maxy), camMinY);
const __m128 culled1_0123 = _mm_or_ps(maxx0123_lt_camMinX, maxy0123_lt_camMinY);
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123);
int mask0123 = _mm_movemask_ps(culled0123);
resData[0] = visibilityResults16[mask0123];
minx += 4;
miny += 4;
maxx += 4;
maxy += 4;
resData++;
rem -= 4;
}
}
__declspec(noinline)
void cullRectsSSE4(const RectSoA* rects, uint32_t numRects, const Rect* camRect, bool* results)
{
const float* minx = rects->minx;
const float* miny = rects->miny;
const float* maxx = rects->maxx;
const float* maxy = rects->maxy;
uint32_t* resData = (uint32_t*)results;
const __m128 cam = _mm_loadu_ps((float*)camRect);
const __m128 camMinX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 camMinY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 camMaxX = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 camMaxY = _mm_shuffle_ps(cam, cam, _MM_SHUFFLE(3, 3, 3, 3));
static const uint32_t allBits[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
const __m128 xmm_all = _mm_loadu_ps((float*)&allBits);
const uint32_t iter = numRects >> 4;
for (uint32_t i = 0; i < iter; ++i) {
const __m128 culled0_0123 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny), camMaxY));
const __m128 culled1_0123 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy), camMinY));
const __m128 culled0_4567 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 4), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 4), camMaxY));
const __m128 culled1_4567 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 4), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 4), camMinY));
const __m128 culled0_891011 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 8), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 8), camMaxY));
const __m128 culled1_891011 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 8), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 8), camMinY));
const __m128 culled0_12131415 = _mm_or_ps(_mm_cmpgt_ps(_mm_load_ps(minx + 12), camMaxX), _mm_cmpgt_ps(_mm_load_ps(miny + 12), camMaxY));
const __m128 culled1_12131415 = _mm_or_ps(_mm_cmplt_ps(_mm_load_ps(maxx + 12), camMinX), _mm_cmplt_ps(_mm_load_ps(maxy + 12), camMinY));
const __m128 culled0123 = _mm_or_ps(culled0_0123, culled1_0123);
const __m128 culled4567 = _mm_or_ps(culled0_4567, culled1_4567);
const __m128 culled891011 = _mm_or_ps(culled0_891011, culled1_891011);
const __m128 culled12131415 = _mm_or_ps(culled0_12131415, culled1_12131415);
int mask0123 = _mm_movemask_ps(culled0123);
int mask4567 = _mm_movemask_ps(culled4567);
int mask891011 = _mm_movemask_ps(culled891011);
int mask12131415 = _mm_movemask_ps(culled12131415);
resData[0] = visibilityResults16[mask0123];
resData[1] = visibilityResults16[mask4567];
resData[2] = visibilityResults16[mask891011];
resData[3] = visibilityResults16[mask12131415];
resData += 4;
minx += 16;
miny += 16;
maxx += 16;
maxy += 16;
}
}
float randomFloat(float minValue, float maxValue)
{
const float t = (float)rand() / (float)RAND_MAX;
return minValue + t * (maxValue - minValue);
}
volatile uint32_t NUM_RECTS = 1024;
int main()
{
#ifdef _DEBUG
const uint32_t NUM_ITERATIONS = 1;
#else
const uint32_t NUM_ITERATIONS = 10000000;
#endif
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
SetThreadAffinityMask(GetCurrentThread(), 0x00000001);
srand(1);
Rect* rects = (Rect*)_aligned_malloc(sizeof(Rect) * NUM_RECTS, 16);
for (uint32_t i = 0; i < NUM_RECTS; ++i) {
const float x = randomFloat(-100.0f, 100.0f);
const float y = randomFloat(-100.0f, 100.0f);
const float w = randomFloat(0.0f, 20.0f);
const float h = randomFloat(0.0f, 20.0f);
rects[i].minx = x;
rects[i].miny = y;
rects[i].maxx = x + w;
rects[i].maxy = y + h;
}
RectSoA rectSoA;
float* soaData = (float*)_aligned_malloc(sizeof(float) * 4 * NUM_RECTS, 16);
rectSoA.minx = soaData;
rectSoA.miny = soaData + NUM_RECTS;
rectSoA.maxx = soaData + NUM_RECTS * 2;
rectSoA.maxy = soaData + NUM_RECTS * 3;
for (uint32_t i = 0; i < NUM_RECTS; ++i) {
rectSoA.minx[i] = rects[i].minx;
rectSoA.miny[i] = rects[i].miny;
rectSoA.maxx[i] = rects[i].maxx;
rectSoA.maxy[i] = rects[i].maxy;
}
bool* results = (bool*)_aligned_malloc(sizeof(bool) * NUM_RECTS, 16);
#if 1
// ~1/4 of the view
const float camX = -50.0f;
const float camY = -50.0f;
const float camW = 100.0f;
const float camH = 100.0f;
#else
#if 0
// All visible
const float camX = -200.0f;
const float camY = -200.0f;
const float camW = 400.0f;
const float camH = 400.0f;
#else
// None visible
const float camX = 200.0f;
const float camY = 200.0f;
const float camW = 400.0f;
const float camH = 400.0f;
#endif
#endif
const Rect cameraRect = {
camX, camY, camX + camW, camY + camH
};
int64_t start = __rdtsc();
for (uint32_t i = 0; i < NUM_ITERATIONS; ++i) {
#if TEST_SIMD == 1
cullRectsSSE(rects, NUM_RECTS, &cameraRect, results);
#elif TEST_SIMD == 2
cullRectsSSE2(rects, NUM_RECTS, &cameraRect, results);
#elif TEST_SIMD == 3
cullRectsSSE3(&rectSoA, NUM_RECTS, &cameraRect, results);
#elif TEST_SIMD == 4
cullRectsSSE4(&rectSoA, NUM_RECTS, &cameraRect, results);
#elif TEST_SIMD == 5
cullRects2(rects, NUM_RECTS, &cameraRect, results);
#else
#if 1
cullRects(rects, NUM_RECTS, &cameraRect, results);
#else
cullRectsSoA(&rectSoA, NUM_RECTS, &cameraRect, results);
#endif
#endif
}
int64_t elapsed = __rdtsc() - start;
printf("(%lld) %.0f\n", elapsed, (double)elapsed / (double)NUM_ITERATIONS);
#if TEST_SIMD
bool* refResults = (bool*)_aligned_malloc(sizeof(bool) * NUM_RECTS, 16);
cullRects(rects, NUM_RECTS, &cameraRect, refResults);
bool err = false;
for (uint32_t i = 0; i < NUM_RECTS; ++i) {
if (refResults[i] != results[i]) {
err = true;
break;
}
}
if (err) {
printf("Invalid results\n");
}
#endif // TEST_SIMD
uint32_t numVisible = 0;
for (uint32_t i = 0; i < NUM_RECTS; ++i) {
numVisible += results[i] ? 1 : 0;
}
printf("# visible: %u\n", numVisible);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment