// View frustum culling optimization: Balancing the pipes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build: gcc -std=c99 -O3 (gcc 4.1.1)
// 93 cycles per iteration (without unrolling)
// 74 cycles per iteration (with 4x unrolling)
// Better-balanced clip-space version
#include <stdbool.h> | |
#include <spu_intrinsics.h> | |
// Shuffle helpers.
//
// Each constant is one 32-bit word of an si_shufb control pattern:
//   L0..L3 select word 0..3 of the first (left) operand,
//   R0..R3 select word 0..3 of the second (right) operand,
//   ZERO   produces a word of zero bytes.
#define L0 0x00010203
#define L1 0x04050607
#define L2 0x08090a0b
#define L3 0x0c0d0e0f
#define R0 0x10111213
#define R1 0x14151617
#define R2 0x18191a1b
#define R3 0x1c1d1e1f
#define ZERO 0x80808080

// Builds a qword whose four words are picked from l / r according to the
// control words x, y, z, w (use the L*/R*/ZERO constants above).
#define SHUFFLE(l, r, x, y, z, w) si_shufb((l), (r), ((qword)(vec_uint4){(x), (y), (z), (w)}))

// Splat helper: replicates word `idx` of v into all four words.
// `idx` must be a literal 0..3 because it is token-pasted onto `L`.
// Note: v is evaluated twice, so it must not have side effects.
#define SPLAT(v, idx) si_shufb((v), (v), (qword)(vec_uint4)(L ## idx))
struct matrix_t | |
{ | |
vec_float4 row0; | |
vec_float4 row1; | |
vec_float4 row2; | |
vec_float4 row3; | |
}; | |
struct aabb_t | |
{ | |
vec_float4 min; | |
vec_float4 max; | |
}; | |
// Transforms four points at once. The points are passed in SoA form: x, y and
// z each hold one coordinate of all four points.
//
// For every output component c (0..3), dest[c] receives, for all four points:
//   dest[c] = x * row0[c] + y * row1[c] + z * row2[c] + row3[c]
// i.e. the input points are treated as having an implicit w of 1.
//
// dest must point to storage for at least four qwords.
static inline void transform_points_4(qword* dest, qword x, qword y, qword z, const struct matrix_t* mat)
{
// SPLAT pastes its index onto a token, so the component has to be a literal;
// that is why this is a macro expanded four times rather than a loop.
#define COMP(c) \
    qword res_ ## c = SPLAT((qword)mat->row3, c); \
    res_ ## c = si_fma(x, SPLAT((qword)mat->row0, c), res_ ## c); \
    res_ ## c = si_fma(y, SPLAT((qword)mat->row1, c), res_ ## c); \
    res_ ## c = si_fma(z, SPLAT((qword)mat->row2, c), res_ ## c); \
    dest[c] = res_ ## c

    COMP(0);
    COMP(1);
    COMP(2);
    COMP(3);

#undef COMP
}
static inline void transform_matrix(struct matrix_t* dest, const struct matrix_t* lhs, const struct matrix_t* rhs) | |
{ | |
#define COMP_0(c) \ | |
qword res_ ## c = si_fm((qword)lhs->row2, SPLAT((qword)rhs->row ## c, 2)); \ | |
res_ ## c = si_fma((qword)lhs->row1, SPLAT((qword)rhs->row ## c, 1), res_ ## c); \ | |
res_ ## c = si_fma((qword)lhs->row0, SPLAT((qword)rhs->row ## c, 0), res_ ## c); \ | |
dest->row ## c = (vec_float4)res_ ## c; | |
#define COMP_1(c) \ | |
qword res_ ## c = si_fma((qword)lhs->row2, SPLAT((qword)rhs->row ## c, 2), (qword)lhs->row3); \ | |
res_ ## c = si_fma((qword)lhs->row1, SPLAT((qword)rhs->row ## c, 1), res_ ## c); \ | |
res_ ## c = si_fma((qword)lhs->row0, SPLAT((qword)rhs->row ## c, 0), res_ ## c); \ | |
dest->row ## c = (vec_float4)res_ ## c; | |
COMP_0(0); | |
COMP_0(1); | |
COMP_0(2); | |
COMP_1(3); | |
#undef COMP_0 | |
#undef COMP_1 | |
} | |
inline unsigned int is_visible_impl(const struct matrix_t* transform, const struct aabb_t* aabb, const struct matrix_t* frustum) | |
{ | |
qword min = (qword)aabb->min; | |
qword max = (qword)aabb->max; | |
// get aabb points (SoA) | |
qword minmax_x = SHUFFLE(min, max, L0, R0, L0, R0); // x X x X | |
qword minmax_y = SHUFFLE(min, max, L1, L1, R1, R1); // y y Y Y | |
qword minmax_z_0 = SPLAT(min, 2); // z z z z | |
qword minmax_z_1 = SPLAT(max, 2); // Z Z Z Z | |
// get clipping matrix | |
struct matrix_t clip; | |
transform_matrix(&clip, frustum, transform); | |
// transform points to clip space | |
qword points_cs_0[4]; | |
qword points_cs_1[4]; | |
transform_points_4(points_cs_0, minmax_x, minmax_y, minmax_z_0, &clip); | |
transform_points_4(points_cs_1, minmax_x, minmax_y, minmax_z_1, &clip); | |
// for each plane... | |
#define NOUT(op, idx0, idx1) si_orx(si_nand(op(points_cs_0[idx0], points_cs_0[idx1]), op(points_cs_1[idx0], points_cs_1[idx1]))) | |
qword nout0 = NOUT(si_fa, 0, 3); // (x + w) >= 0 for any point | |
qword nout1 = NOUT(si_fs, 3, 0); // (w - x) >= 0 for any point | |
qword nout2 = NOUT(si_fa, 1, 3); // (y + w) >= 0 for any point | |
qword nout3 = NOUT(si_fs, 3, 1); // (w - y) >= 0 for any point | |
qword nout4 = si_orx(si_nand(points_cs_0[2], points_cs_1[2])); // z >= 0 for any point | |
qword nout5 = NOUT(si_fs, 3, 2); // (w - z) >= 0 for any point | |
#undef NOUT | |
// merge "not outside" flags | |
qword nout01 = si_and(nout0, nout1); | |
qword nout34 = si_and(nout3, nout4); | |
qword nout012 = si_and(nout01, nout2); | |
qword nout345 = si_and(nout34, nout5); | |
qword nout = si_and(nout012, nout345); | |
return si_to_int(nout) >> 31; | |
} | |
// Runs the visibility test for `count` boxes.
//
// result[i] receives the value returned by is_visible_impl for
// (transform[i], aabb[i]), converted to a qword with si_from_uint.
// result, transform and aabb must each hold at least `count` elements;
// the same frustum matrix is used for every box.
//
// The main loop is unrolled four times; a scalar tail loop handles counts that
// are not a multiple of four so that no element past `count` is touched.
__attribute__((noinline)) void is_visible(qword* result, const struct matrix_t* transform, const struct aabb_t* aabb, unsigned int count, const struct matrix_t* frustum)
{
    const unsigned int unrolled_count = count & ~3u;
    unsigned int i = 0;

    for (; i < unrolled_count; i += 4)
    {
        qword r0 = si_from_uint(is_visible_impl(transform + i + 0, aabb + i + 0, frustum));
        qword r1 = si_from_uint(is_visible_impl(transform + i + 1, aabb + i + 1, frustum));
        qword r2 = si_from_uint(is_visible_impl(transform + i + 2, aabb + i + 2, frustum));
        qword r3 = si_from_uint(is_visible_impl(transform + i + 3, aabb + i + 3, frustum));

        result[i + 0] = r0;
        result[i + 1] = r1;
        result[i + 2] = r2;
        result[i + 3] = r3;
    }

    // remaining 0..3 boxes
    for (; i < count; ++i)
    {
        result[i] = si_from_uint(is_visible_impl(transform + i, aabb + i, frustum));
    }
}
#define COUNT 1024 | |
// simple ortho frustum | |
struct matrix_t frustum = | |
{ | |
{ 0.1f, 0, 0, 0 }, | |
{ 0, 0.1f, 0, 0 }, | |
{ 0, 0, 0.1f, 0 }, | |
{ 0, 0, 0, 1 } | |
}; | |
// small box | |
struct aabb_t aabb[COUNT] = | |
{ | |
{ -1, -2, -3 }, | |
{ 1, 2, 3 } | |
}; | |
// and some weird matrix | |
struct matrix_t transform[COUNT] = | |
{ | |
{ 0.123f, 0.456f, 0.789f }, | |
{ 0.456f, 0.123f, 0.789f }, | |
{ 0.789f, 0.123f, 0.456f }, | |
{ 1.f, -1.f, 1.f } | |
}; | |
qword result[COUNT]; | |
int main() | |
{ | |
is_visible(result, transform, aabb, COUNT, &frustum); | |
si_stop(0); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment