Last active
January 29, 2016 15:19
-
-
Save pendingchaos/6b6f1e43040e55fce5cd to your computer and use it in GitHub Desktop.
AABB transformation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//-O0: SSE is ~6.3x faster
//-O1: SSE is ~4.6x faster
//-O2: SSE is ~2.5x faster
//-O3: SSE is ~2.2x faster
//-Og: SSE is ~5.5x faster
//-Ofast: SSE is ~1.7x faster
//-Os: SSE is ~2.3x faster
//You probably should not trust these timings.
//Compiled with GCC 5.3.1
//Ran on an Intel(R) Core(TM) i7-3770K CPU
//Matrices are in column-major layout (i.e. the OpenGL convention, not the Direct3D one).
#include <xmmintrin.h> | |
#include <math.h> | |
//Transforms an axis-aligned bounding box by a 4x4 matrix using SSE.
//
//matrix_: 4x4 matrix; matrix_[j] is read as four contiguous floats and
//         matrix_[3] supplies the starting value (translation) of both corners.
//min/max: in: box corners, only elements 0..2 are read.
//         out: transformed box corners. All four elements of each array are
//         written, so min[3] and max[3] are overwritten as well.
//
//Neither the matrix nor min/max need to be 16-byte aligned: every load and
//store uses the unaligned SSE variants.
void transformAABB_SSE(const float matrix_[4][4], float min[4], float max[4]) {
    //Both corners start at the translation and accumulate per-axis extremes.
    __m128 rmin = _mm_loadu_ps(matrix_[3]);
    __m128 rmax = rmin;

    for (int j = 0; j < 3; j++) {
        const __m128 column = _mm_loadu_ps(matrix_[j]);
        //Scale the column by both ends of the box along axis j; whichever
        //product is smaller in a lane contributes to rmin, the other to rmax.
        const __m128 x = _mm_mul_ps(_mm_set1_ps(min[j]), column);
        const __m128 y = _mm_mul_ps(_mm_set1_ps(max[j]), column);
        rmin = _mm_add_ps(rmin, _mm_min_ps(x, y));
        rmax = _mm_add_ps(rmax, _mm_max_ps(x, y));
    }

    //Unaligned stores: callers pass plain float[4] arrays with no alignment
    //guarantee, and _mm_store_ps would fault on a misaligned address.
    _mm_storeu_ps(min, rmin);
    _mm_storeu_ps(max, rmax);
}
void transformAABB(const float matrix[4][4], float min[4], float max[4]) { | |
float rmin[3] = {matrix[3][0], matrix[3][1], matrix[3][2]}; | |
float rmax[3] = {matrix[3][0], matrix[3][1], matrix[3][2]}; | |
#define ITER(i, j) {\ | |
float x = min[j] * matrix[j][i];\ | |
float y = max[j] * matrix[j][i];\ | |
rmin[i] += fmin(x, y);\ | |
rmax[i] += fmax(x, y);\ | |
} | |
ITER(0, 0) | |
ITER(0, 1) | |
ITER(0, 2) | |
ITER(1, 0) | |
ITER(1, 1) | |
ITER(1, 2) | |
ITER(2, 0) | |
ITER(2, 1) | |
ITER(2, 2) | |
#undef ITER | |
min[0] = rmin[0]; | |
min[1] = rmin[1]; | |
min[2] = rmin[2]; | |
max[0] = rmax[0]; | |
max[1] = rmax[1]; | |
max[2] = rmax[2]; | |
} | |
//Benchmark driver: applies transformAABB_SSE to the same box ten million times.
int main(void) {
    //Identity transform: the box is unchanged by every iteration, so the
    //values stay finite no matter how many times the loop runs.
    const float matrix[4][4] = {
        {1.0f, 0.0f, 0.0f, 0.0f},
        {0.0f, 1.0f, 0.0f, 0.0f},
        {0.0f, 0.0f, 1.0f, 0.0f},
        {0.0f, 0.0f, 0.0f, 1.0f},
    };
    //16-byte aligned so the buffers are valid for aligned SSE stores too.
    _Alignas(16) float min[4] = {-1.0f, -1.0f, -1.0f, 1.0f};
    _Alignas(16) float max[4] = {1.0f, 1.0f, 1.0f, 1.0f};

    for (size_t i = 0; i < 10000000; i++) {
        transformAABB_SSE(matrix, min, max);
    }

    //Stop the compiler from optimizing stuff away: the volatile write forces
    //the final box to actually be computed.
    volatile float sink = min[0] + min[1] + min[2] + max[0] + max[1] + max[2];
    (void)sink;
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment