Created
June 26, 2012 07:48
-
-
Save rikusalminen/2994204 to your computer and use it in GitHub Desktop.
Some 3D math with x86 SIMD: SSE, AVX, FMA, et al
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <x86intrin.h> | |
#include <stdio.h> | |
void printv(__m128 vec) | |
{ | |
float x[4]; | |
_mm_storeu_ps(x, vec); | |
printf("%2.3f\t%2.3f\t%2.3f\t%2.3f\n", x[0], x[1], x[2], x[3]); | |
} | |
void printvi(__m128 vec) | |
{ | |
uint32_t x[4]; | |
_mm_storeu_ps((float*)x, vec); | |
printf("%8x\t%8x\t%8x\t%8x\n", x[0], x[1], x[2], x[3]); | |
} | |
typedef float vec4 __attribute__((vector_size(16))); | |
static inline vec4 vadd(vec4 x, vec4 y) { return _mm_add_ps(x, y); } | |
static inline vec4 vsub(vec4 x, vec4 y) { return _mm_sub_ps(x, y); } | |
static inline vec4 vmul(vec4 x, vec4 y) { return _mm_mul_ps(x, y); } | |
static inline vec4 vdiv(vec4 x, vec4 y) { return _mm_div_ps(x, y); } | |
static inline vec4 vmin(vec4 x, vec4 y) { return _mm_min_ps(x, y); } | |
static inline vec4 vmax(vec4 x, vec4 y) { return _mm_max_ps(x, y); } | |
static inline vec4 vaddf(vec4 x, vec4 y) { return _mm_add_ss(x, y); } | |
static inline vec4 vsubf(vec4 x, vec4 y) { return _mm_sub_ss(x, y); } | |
static inline vec4 vmulf(vec4 x, vec4 y) { return _mm_mul_ss(x, y); } | |
static inline vec4 vdivf(vec4 x, vec4 y) { return _mm_div_ss(x, y); } | |
static inline vec4 vminf(vec4 x, vec4 y) { return _mm_min_ss(x, y); } | |
static inline vec4 vmaxf(vec4 x, vec4 y) { return _mm_max_ss(x, y); } | |
static inline vec4 vsqrt(vec4 x) { return _mm_sqrt_ps(x); } | |
static inline vec4 vsqrtf(vec4 x) { return _mm_sqrt_ss(x); } | |
static inline vec4 vrcp(vec4 x) { return _mm_rcp_ps(x); } | |
static inline vec4 vrcpf(vec4 x) { return _mm_rcp_ss(x); } | |
static inline vec4 vrsqrt(vec4 x) { return _mm_rsqrt_ps(x); } | |
static inline vec4 vrsqrtf(vec4 x) { return _mm_rsqrt_ss(x); } | |
#define vshuffle_mask(a, b, c, d) (((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6)) | |
#define vshuffle(x, y, a, b, c, d) (__builtin_ia32_shufps((x), (y), vshuffle_mask((a),(b),(c),(d)))) | |
#define vsplat(v, x) vshuffle((v), (v), (x), (x), (x), (x)) | |
#ifdef __FMA4__ | |
static inline vec4 vmadd(vec4 a, vec4 b, vec4 c) | |
{ | |
return _mm_macc_ps(a, b, c); | |
} | |
static inline vec4 vnmadd(vec4 a, vec4 b, vec4 c) | |
{ | |
return _mm_nmacc_ps(a, b, c); | |
} | |
static inline vec4 vmsub(vec4 a, vec4 b, vec4 c) | |
{ | |
return _mm_msub_ps(a, b, c); | |
} | |
static inline vec4 vnmsub(vec4 a, vec4 b, vec4 c) | |
{ | |
return _mm_nmsub_ps(a, b, c); | |
} | |
#else | |
static inline vec4 vmadd(vec4 a, vec4 b, vec4 c) | |
{ | |
return a * b + c; | |
} | |
static inline vec4 vnmadd(vec4 a, vec4 b, vec4 c) | |
{ | |
return -(a * b) + c; | |
} | |
static inline vec4 vmsub(vec4 a, vec4 b, vec4 c) | |
{ | |
return a * b - c; | |
} | |
static inline vec4 vnmsub(vec4 a, vec4 b, vec4 c) | |
{ | |
return -(a * b) - c; | |
} | |
#endif | |
#ifdef __SSE4_1__ | |
static inline vec4 vblend(vec4 a, vec4 b, vec4 mask) { return _mm_blendv_ps(a, b, mask); } | |
static inline vec4 vblend_mask(vec4 a, vec4 b, const int mask) { return _mm_blend_ps(a, b, mask); } | |
static inline vec4 vdot(vec4 x, vec4 y) | |
{ | |
return _mm_dp_ps(x, y, 0xf); | |
} | |
static inline vec4 vdot3(vec4 x, vec4 y) | |
{ | |
return _mm_dp_ps(x, y, 0x7); | |
} | |
#else | |
static inline vec4 vblend(vec4 a, vec4 b, vec4 mask) | |
{ | |
vec4 extended_mask = _mm_cmplt_ps(mask, _mm_setzero_ps()); | |
return _mm_or_ps( | |
_mm_andnot_ps(extended_mask, a), | |
_mm_and_ps(extended_mask, b)); | |
} | |
static inline vec4 vblend_mask(vec4 a, vec4 b, const int mask); | |
#ifdef __SSE3__ | |
static inline vec4 vdot(vec4 x, vec4 y) | |
{ | |
vec4 prod = x * y; | |
vec4 sum1 = _mm_hadd_ps(prod, prod); | |
vec4 sum2 = _mm_hadd_ps(sum1, sum1); | |
return sum2; | |
} | |
#else | |
static inline vec4 vdot(vec4 x, vec4 y) | |
{ | |
vec4 prod = x * y; | |
vec4 sum1 = prod + vshuffle(prod, prod, 1, 0, 3, 2); | |
vec4 sum2 = sum1 + vshuffle(sum1, sum1, 2, 2, 0, 0); | |
return sum2; | |
} | |
#endif | |
static inline vec4 vdot3(vec4 x, vec4 y) | |
{ | |
vec4 zero = { 0, 0, 0, 0 }; | |
return vdot( | |
vshuffle(x, vshuffle(x, zero, 2, 3, 0, 0), 0, 1, 0, 3), | |
vshuffle(y, vshuffle(y, zero, 2, 3, 0, 0), 0, 1, 0, 3)); | |
} | |
#endif | |
static inline vec4 vmag(vec4 x) | |
{ | |
return vsqrt(vdot(x, x)); | |
} | |
static inline vec4 vmagf(vec4 x) | |
{ | |
return vsqrtf(vdot(x, x)); | |
} | |
static inline vec4 vunit(vec4 x) | |
{ | |
return x * vrsqrt(vdot(x, x)); | |
} | |
struct mat4_t | |
{ | |
vec4 rows[4]; | |
} __attribute__((aligned(16))); | |
typedef struct mat4_t mat4; | |
void printm(mat4 m) | |
{ | |
for(int i = 0; i < 4; ++i) | |
printv(m.rows[i]); | |
printf("\n"); | |
} | |
static inline vec4 mvmul_rows(vec4 x, vec4 y, vec4 z, vec4 w, vec4 v) | |
{ | |
return vshuffle( | |
vshuffle( | |
vdot(x, v), | |
vdot(y, v), | |
0, 0, 0, 0), | |
vshuffle( | |
vdot(z, v), | |
vdot(w, v), | |
0, 0, 0, 0), | |
0, 2, 0, 2); | |
} | |
static inline vec4 mvmul(mat4 m, vec4 v) | |
{ | |
return mvmul_rows(m.rows[0], m.rows[1], m.rows[2], m.rows[3], v); | |
} | |
static inline mat4 mmmul(mat4 l, mat4 r) | |
{ | |
_MM_TRANSPOSE4_PS(r.rows[0], r.rows[1], r.rows[2], r.rows[3]); | |
vec4 row0 = vshuffle( | |
vshuffle(vdot(l.rows[0], r.rows[0]), vdot(l.rows[0], r.rows[1]), 0, 0, 0, 0), | |
vshuffle(vdot(l.rows[0], r.rows[2]), vdot(l.rows[0], r.rows[3]), 0, 0, 0, 0), | |
0, 2, 0, 2); | |
vec4 row1 = vshuffle( | |
vshuffle(vdot(l.rows[1], r.rows[0]), vdot(l.rows[1], r.rows[1]), 0, 0, 0, 0), | |
vshuffle(vdot(l.rows[1], r.rows[2]), vdot(l.rows[1], r.rows[3]), 0, 0, 0, 0), | |
0, 2, 0, 2); | |
vec4 row2 = vshuffle( | |
vshuffle(vdot(l.rows[2], r.rows[0]), vdot(l.rows[2], r.rows[1]), 0, 0, 0, 0), | |
vshuffle(vdot(l.rows[2], r.rows[2]), vdot(l.rows[2], r.rows[3]), 0, 0, 0, 0), | |
0, 2, 0, 2); | |
vec4 row3 = vshuffle( | |
vshuffle(vdot(l.rows[3], r.rows[0]), vdot(l.rows[3], r.rows[1]), 0, 0, 0, 0), | |
vshuffle(vdot(l.rows[3], r.rows[2]), vdot(l.rows[3], r.rows[3]), 0, 0, 0, 0), | |
0, 2, 0, 2); | |
mat4 result = { { row0, row1, row2, row3 } }; | |
return result; | |
} | |
static inline mat4 mmmul_freevec(mat4 l, mat4 r) | |
{ | |
vec4 row0 = | |
vmadd(vsplat(l.rows[0], 3), r.rows[0], | |
vmadd(vsplat(l.rows[0], 2), r.rows[0], | |
vmadd(vsplat(l.rows[0], 1), r.rows[0], | |
(vsplat(l.rows[0], 0) * r.rows[0])))); | |
vec4 row1 = | |
vmadd(vsplat(l.rows[1], 3), r.rows[1], | |
vmadd(vsplat(l.rows[1], 2), r.rows[1], | |
vmadd(vsplat(l.rows[1], 1), r.rows[1], | |
(vsplat(l.rows[1], 0) * r.rows[1])))); | |
vec4 row2 = | |
vmadd(vsplat(l.rows[2], 3), r.rows[2], | |
vmadd(vsplat(l.rows[2], 2), r.rows[2], | |
vmadd(vsplat(l.rows[2], 1), r.rows[2], | |
(vsplat(l.rows[2], 0) * r.rows[2])))); | |
vec4 row3 = | |
vmadd(vsplat(l.rows[3], 3), r.rows[3], | |
vmadd(vsplat(l.rows[3], 2), r.rows[3], | |
vmadd(vsplat(l.rows[3], 1), r.rows[3], | |
(vsplat(l.rows[3], 0) * r.rows[3])))); | |
mat4 result = { { row0, row1, row2, row3 } }; | |
return result; | |
} | |
static inline mat4 mmmul_add(mat4 l, mat4 r) | |
{ | |
vec4 row0 = | |
vsplat(l.rows[0], 3) * r.rows[0] + | |
vsplat(l.rows[0], 2) * r.rows[0] + | |
vsplat(l.rows[0], 1) * r.rows[0] + | |
vsplat(l.rows[0], 0) * r.rows[0]; | |
vec4 row1 = | |
vsplat(l.rows[1], 3) * r.rows[1] + | |
vsplat(l.rows[1], 2) * r.rows[1] + | |
vsplat(l.rows[1], 1) * r.rows[1] + | |
vsplat(l.rows[1], 0) * r.rows[1]; | |
vec4 row2 = | |
vsplat(l.rows[2], 3) * r.rows[2] + | |
vsplat(l.rows[2], 2) * r.rows[2] + | |
vsplat(l.rows[2], 1) * r.rows[2] + | |
vsplat(l.rows[2], 0) * r.rows[2]; | |
vec4 row3 = | |
vsplat(l.rows[3], 3) * r.rows[3] + | |
vsplat(l.rows[3], 2) * r.rows[3] + | |
vsplat(l.rows[3], 1) * r.rows[3] + | |
vsplat(l.rows[3], 0) * r.rows[3]; | |
mat4 result = { { row0, row1, row2, row3 } }; | |
return result; | |
} | |
static inline mat4 minverse_transpose_rows(vec4 row0, vec4 row1, vec4 row2, vec4 row3) | |
{ | |
vec4 minor0, minor1, minor2, minor3; | |
vec4 det, tmp1; | |
row1 = vshuffle(row1, row1, 2, 3, 0, 1); | |
row3 = vshuffle(row3, row3, 2, 3, 0, 1); | |
tmp1 = row2 * row3; | |
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2); | |
minor0 = row1 * tmp1; | |
minor1 = row0 * tmp1; | |
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1); | |
minor0 = vmsub(row1, tmp1, minor0); | |
minor1 = vmsub(row0, tmp1, minor1); | |
minor1 = vshuffle(minor1, minor1, 2, 3, 0, 1); | |
tmp1 = row1 * row2; | |
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2); | |
minor0 = vmadd(row3, tmp1, minor0); | |
minor3 = row0 * tmp1; | |
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1); | |
minor0 = vnmadd(row3, tmp1, minor0); | |
minor3 = vmsub(row0, tmp1, minor3); | |
minor3 = vshuffle(minor3, minor3, 2, 3, 0, 1); | |
tmp1 = vshuffle(row1, row1, 2, 3, 0, 1) * row3; | |
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2); | |
row2 = vshuffle(row2, row2, 2, 3, 0, 1); | |
minor0 = vmadd(row2, tmp1, minor0); | |
minor2 = row0 * tmp1; | |
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1); | |
minor0 = vnmadd(row2, tmp1, minor0); | |
minor2 = vmsub(row0, tmp1, minor2); | |
minor2 = vshuffle(minor2, minor2, 2, 3, 0, 1); | |
tmp1 = row0 * row1; | |
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2); | |
minor2 = vmadd(row3, tmp1, minor2); | |
minor3 = vmsub(row2, tmp1, minor3); | |
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1); | |
minor2 = vmsub(row3, tmp1, minor2); | |
minor3 = vnmadd(row2, tmp1, minor3); | |
tmp1 = row0 * row3; | |
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2); | |
minor1 = vnmadd(row2, tmp1, minor1); | |
minor2 = vmadd(row1, tmp1, minor2); | |
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1); | |
minor1 = vmadd(row2, tmp1, minor1); | |
minor2 = vnmadd(row1, tmp1, minor2); | |
tmp1 = row0 * row2; | |
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2); | |
minor1 = vmadd(row3, tmp1, minor1); | |
minor3 = vnmadd(row1, tmp1, minor3); | |
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1); | |
minor1 = vnmadd(row3, tmp1, minor1); | |
minor3 = vmadd(row1, tmp1, minor3); | |
det = row0 * minor0; | |
det = vshuffle(det, det, 2, 3, 0, 1) + det; | |
det = vaddf(vshuffle(det, det, 1, 0, 3, 2), det); | |
tmp1 = vrcpf(det); | |
det = vsubf(vaddf(tmp1, tmp1), vmulf(det, vmulf(tmp1, tmp1))); | |
det = vshuffle(det, det, 0, 0, 0, 0); | |
minor0 = det * minor0; | |
minor1 = det * minor1; | |
minor2 = det * minor2; | |
minor3 = det * minor3; | |
mat4 result = { minor0, minor1, minor2, minor3 }; | |
return result; | |
} | |
static inline mat4 mtranspose(mat4 m) | |
{ | |
_MM_TRANSPOSE4_PS(m.rows[0], m.rows[1], m.rows[2], m.rows[3]); | |
return m; | |
} | |
static inline mat4 minverse(mat4 m) | |
{ | |
return mtranspose(minverse_transpose_rows(m.rows[0], m.rows[1], m.rows[2], m.rows[3])); | |
} | |
int main() | |
{ | |
//printv(_mm_setr_ps(1,2,3,4)); | |
vec4 x = { 1, 2, 3, 4 }; | |
vec4 y = { 5, 6, 7, 8 }; | |
vec4 mask = { 1, -1, 1, -1 }; | |
printv(vblend(x, y, mask)); | |
mat4 identity = {{ | |
{ 1, 0, 0, 0 }, | |
{ 0, 1, 0, 0 }, | |
{ 0, 0, 1, 0 }, | |
{ 0, 0, 0, 1 } | |
}}; | |
mat4 matrix = {{ | |
{ 2, 0, 0, 3 }, | |
{ 0, 3, 0, 2 }, | |
{ 0, 0, 4, 1 }, | |
{ 0, 0, 0, 1 } | |
}}; | |
//printv4(x); | |
//printv(x+x); | |
//printv(__builtin_ia32_shufps(x, x, 0x00)); | |
//printv(__builtin_ia32_shufps(x, y, 0x04)); | |
//printv(vshuffle(x, x, 1, 1, 1, 1)); | |
//printv(vshuffle(x, x, 0, 1, 2, 3)); | |
//printv(vshuffle(x, x, 4, 5, 6, 7)); | |
//printv(vshuffle(x, x, 3, 2, 1, 0)); | |
//printv(-x); | |
//printv(vdot(x, x)); | |
//printv(vdot3(x, x)); | |
//printv(mvmul(identity, y)); | |
//printv(vmag(x)); | |
//printv(vmagf(x)); | |
//printv(vunit(x)); | |
//printm(identity); | |
//mat4 test = mtranspose(minverse(identity)); | |
//printm(test); | |
//vec4 foo = mvmul(mtranspose(minverse(identity)), x); | |
//printv(foo); | |
//printm(minverse(matrix)); | |
//printm(mtranspose(identity)); | |
//printm(identity); | |
//printm(mmmul(identity, identity)); | |
//printm(mmmul_freevec(identity, identity)); | |
//printm(mmmul_add(identity, matrix)); | |
//printm(mmmul(identity, minverse(identity))); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment