Skip to content

Instantly share code, notes, and snippets.

@rikusalminen
Created June 26, 2012 07:48
Show Gist options
  • Save rikusalminen/2994204 to your computer and use it in GitHub Desktop.
Save rikusalminen/2994204 to your computer and use it in GitHub Desktop.
Some 3D math with x86 SIMD: SSE, AVX, FMA, et al
#include <stdint.h>
#include <x86intrin.h>
#include <stdio.h>
void printv(__m128 vec)
{
float x[4];
_mm_storeu_ps(x, vec);
printf("%2.3f\t%2.3f\t%2.3f\t%2.3f\n", x[0], x[1], x[2], x[3]);
}
void printvi(__m128 vec)
{
uint32_t x[4];
_mm_storeu_ps((float*)x, vec);
printf("%8x\t%8x\t%8x\t%8x\n", x[0], x[1], x[2], x[3]);
}
typedef float vec4 __attribute__((vector_size(16)));
static inline vec4 vadd(vec4 x, vec4 y) { return _mm_add_ps(x, y); }
static inline vec4 vsub(vec4 x, vec4 y) { return _mm_sub_ps(x, y); }
static inline vec4 vmul(vec4 x, vec4 y) { return _mm_mul_ps(x, y); }
static inline vec4 vdiv(vec4 x, vec4 y) { return _mm_div_ps(x, y); }
static inline vec4 vmin(vec4 x, vec4 y) { return _mm_min_ps(x, y); }
static inline vec4 vmax(vec4 x, vec4 y) { return _mm_max_ps(x, y); }
static inline vec4 vaddf(vec4 x, vec4 y) { return _mm_add_ss(x, y); }
static inline vec4 vsubf(vec4 x, vec4 y) { return _mm_sub_ss(x, y); }
static inline vec4 vmulf(vec4 x, vec4 y) { return _mm_mul_ss(x, y); }
static inline vec4 vdivf(vec4 x, vec4 y) { return _mm_div_ss(x, y); }
static inline vec4 vminf(vec4 x, vec4 y) { return _mm_min_ss(x, y); }
static inline vec4 vmaxf(vec4 x, vec4 y) { return _mm_max_ss(x, y); }
static inline vec4 vsqrt(vec4 x) { return _mm_sqrt_ps(x); }
static inline vec4 vsqrtf(vec4 x) { return _mm_sqrt_ss(x); }
static inline vec4 vrcp(vec4 x) { return _mm_rcp_ps(x); }
static inline vec4 vrcpf(vec4 x) { return _mm_rcp_ss(x); }
static inline vec4 vrsqrt(vec4 x) { return _mm_rsqrt_ps(x); }
static inline vec4 vrsqrtf(vec4 x) { return _mm_rsqrt_ss(x); }
#define vshuffle_mask(a, b, c, d) (((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6))
#define vshuffle(x, y, a, b, c, d) (__builtin_ia32_shufps((x), (y), vshuffle_mask((a),(b),(c),(d))))
#define vsplat(v, x) vshuffle((v), (v), (x), (x), (x), (x))
#ifdef __FMA4__
static inline vec4 vmadd(vec4 a, vec4 b, vec4 c)
{
return _mm_macc_ps(a, b, c);
}
static inline vec4 vnmadd(vec4 a, vec4 b, vec4 c)
{
return _mm_nmacc_ps(a, b, c);
}
static inline vec4 vmsub(vec4 a, vec4 b, vec4 c)
{
return _mm_msub_ps(a, b, c);
}
static inline vec4 vnmsub(vec4 a, vec4 b, vec4 c)
{
return _mm_nmsub_ps(a, b, c);
}
#else
static inline vec4 vmadd(vec4 a, vec4 b, vec4 c)
{
return a * b + c;
}
static inline vec4 vnmadd(vec4 a, vec4 b, vec4 c)
{
return -(a * b) + c;
}
static inline vec4 vmsub(vec4 a, vec4 b, vec4 c)
{
return a * b - c;
}
static inline vec4 vnmsub(vec4 a, vec4 b, vec4 c)
{
return -(a * b) - c;
}
#endif
#ifdef __SSE4_1__
static inline vec4 vblend(vec4 a, vec4 b, vec4 mask) { return _mm_blendv_ps(a, b, mask); }
static inline vec4 vblend_mask(vec4 a, vec4 b, const int mask) { return _mm_blend_ps(a, b, mask); }
static inline vec4 vdot(vec4 x, vec4 y)
{
return _mm_dp_ps(x, y, 0xf);
}
static inline vec4 vdot3(vec4 x, vec4 y)
{
return _mm_dp_ps(x, y, 0x7);
}
#else
static inline vec4 vblend(vec4 a, vec4 b, vec4 mask)
{
vec4 extended_mask = _mm_cmplt_ps(mask, _mm_setzero_ps());
return _mm_or_ps(
_mm_andnot_ps(extended_mask, a),
_mm_and_ps(extended_mask, b));
}
static inline vec4 vblend_mask(vec4 a, vec4 b, const int mask);
#ifdef __SSE3__
static inline vec4 vdot(vec4 x, vec4 y)
{
vec4 prod = x * y;
vec4 sum1 = _mm_hadd_ps(prod, prod);
vec4 sum2 = _mm_hadd_ps(sum1, sum1);
return sum2;
}
#else
static inline vec4 vdot(vec4 x, vec4 y)
{
vec4 prod = x * y;
vec4 sum1 = prod + vshuffle(prod, prod, 1, 0, 3, 2);
vec4 sum2 = sum1 + vshuffle(sum1, sum1, 2, 2, 0, 0);
return sum2;
}
#endif
static inline vec4 vdot3(vec4 x, vec4 y)
{
vec4 zero = { 0, 0, 0, 0 };
return vdot(
vshuffle(x, vshuffle(x, zero, 2, 3, 0, 0), 0, 1, 0, 3),
vshuffle(y, vshuffle(y, zero, 2, 3, 0, 0), 0, 1, 0, 3));
}
#endif
static inline vec4 vmag(vec4 x)
{
return vsqrt(vdot(x, x));
}
static inline vec4 vmagf(vec4 x)
{
return vsqrtf(vdot(x, x));
}
static inline vec4 vunit(vec4 x)
{
return x * vrsqrt(vdot(x, x));
}
struct mat4_t
{
vec4 rows[4];
} __attribute__((aligned(16)));
typedef struct mat4_t mat4;
void printm(mat4 m)
{
for(int i = 0; i < 4; ++i)
printv(m.rows[i]);
printf("\n");
}
static inline vec4 mvmul_rows(vec4 x, vec4 y, vec4 z, vec4 w, vec4 v)
{
return vshuffle(
vshuffle(
vdot(x, v),
vdot(y, v),
0, 0, 0, 0),
vshuffle(
vdot(z, v),
vdot(w, v),
0, 0, 0, 0),
0, 2, 0, 2);
}
static inline vec4 mvmul(mat4 m, vec4 v)
{
return mvmul_rows(m.rows[0], m.rows[1], m.rows[2], m.rows[3], v);
}
static inline mat4 mmmul(mat4 l, mat4 r)
{
_MM_TRANSPOSE4_PS(r.rows[0], r.rows[1], r.rows[2], r.rows[3]);
vec4 row0 = vshuffle(
vshuffle(vdot(l.rows[0], r.rows[0]), vdot(l.rows[0], r.rows[1]), 0, 0, 0, 0),
vshuffle(vdot(l.rows[0], r.rows[2]), vdot(l.rows[0], r.rows[3]), 0, 0, 0, 0),
0, 2, 0, 2);
vec4 row1 = vshuffle(
vshuffle(vdot(l.rows[1], r.rows[0]), vdot(l.rows[1], r.rows[1]), 0, 0, 0, 0),
vshuffle(vdot(l.rows[1], r.rows[2]), vdot(l.rows[1], r.rows[3]), 0, 0, 0, 0),
0, 2, 0, 2);
vec4 row2 = vshuffle(
vshuffle(vdot(l.rows[2], r.rows[0]), vdot(l.rows[2], r.rows[1]), 0, 0, 0, 0),
vshuffle(vdot(l.rows[2], r.rows[2]), vdot(l.rows[2], r.rows[3]), 0, 0, 0, 0),
0, 2, 0, 2);
vec4 row3 = vshuffle(
vshuffle(vdot(l.rows[3], r.rows[0]), vdot(l.rows[3], r.rows[1]), 0, 0, 0, 0),
vshuffle(vdot(l.rows[3], r.rows[2]), vdot(l.rows[3], r.rows[3]), 0, 0, 0, 0),
0, 2, 0, 2);
mat4 result = { { row0, row1, row2, row3 } };
return result;
}
static inline mat4 mmmul_freevec(mat4 l, mat4 r)
{
vec4 row0 =
vmadd(vsplat(l.rows[0], 3), r.rows[0],
vmadd(vsplat(l.rows[0], 2), r.rows[0],
vmadd(vsplat(l.rows[0], 1), r.rows[0],
(vsplat(l.rows[0], 0) * r.rows[0]))));
vec4 row1 =
vmadd(vsplat(l.rows[1], 3), r.rows[1],
vmadd(vsplat(l.rows[1], 2), r.rows[1],
vmadd(vsplat(l.rows[1], 1), r.rows[1],
(vsplat(l.rows[1], 0) * r.rows[1]))));
vec4 row2 =
vmadd(vsplat(l.rows[2], 3), r.rows[2],
vmadd(vsplat(l.rows[2], 2), r.rows[2],
vmadd(vsplat(l.rows[2], 1), r.rows[2],
(vsplat(l.rows[2], 0) * r.rows[2]))));
vec4 row3 =
vmadd(vsplat(l.rows[3], 3), r.rows[3],
vmadd(vsplat(l.rows[3], 2), r.rows[3],
vmadd(vsplat(l.rows[3], 1), r.rows[3],
(vsplat(l.rows[3], 0) * r.rows[3]))));
mat4 result = { { row0, row1, row2, row3 } };
return result;
}
static inline mat4 mmmul_add(mat4 l, mat4 r)
{
vec4 row0 =
vsplat(l.rows[0], 3) * r.rows[0] +
vsplat(l.rows[0], 2) * r.rows[0] +
vsplat(l.rows[0], 1) * r.rows[0] +
vsplat(l.rows[0], 0) * r.rows[0];
vec4 row1 =
vsplat(l.rows[1], 3) * r.rows[1] +
vsplat(l.rows[1], 2) * r.rows[1] +
vsplat(l.rows[1], 1) * r.rows[1] +
vsplat(l.rows[1], 0) * r.rows[1];
vec4 row2 =
vsplat(l.rows[2], 3) * r.rows[2] +
vsplat(l.rows[2], 2) * r.rows[2] +
vsplat(l.rows[2], 1) * r.rows[2] +
vsplat(l.rows[2], 0) * r.rows[2];
vec4 row3 =
vsplat(l.rows[3], 3) * r.rows[3] +
vsplat(l.rows[3], 2) * r.rows[3] +
vsplat(l.rows[3], 1) * r.rows[3] +
vsplat(l.rows[3], 0) * r.rows[3];
mat4 result = { { row0, row1, row2, row3 } };
return result;
}
static inline mat4 minverse_transpose_rows(vec4 row0, vec4 row1, vec4 row2, vec4 row3)
{
vec4 minor0, minor1, minor2, minor3;
vec4 det, tmp1;
row1 = vshuffle(row1, row1, 2, 3, 0, 1);
row3 = vshuffle(row3, row3, 2, 3, 0, 1);
tmp1 = row2 * row3;
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2);
minor0 = row1 * tmp1;
minor1 = row0 * tmp1;
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1);
minor0 = vmsub(row1, tmp1, minor0);
minor1 = vmsub(row0, tmp1, minor1);
minor1 = vshuffle(minor1, minor1, 2, 3, 0, 1);
tmp1 = row1 * row2;
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2);
minor0 = vmadd(row3, tmp1, minor0);
minor3 = row0 * tmp1;
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1);
minor0 = vnmadd(row3, tmp1, minor0);
minor3 = vmsub(row0, tmp1, minor3);
minor3 = vshuffle(minor3, minor3, 2, 3, 0, 1);
tmp1 = vshuffle(row1, row1, 2, 3, 0, 1) * row3;
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2);
row2 = vshuffle(row2, row2, 2, 3, 0, 1);
minor0 = vmadd(row2, tmp1, minor0);
minor2 = row0 * tmp1;
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1);
minor0 = vnmadd(row2, tmp1, minor0);
minor2 = vmsub(row0, tmp1, minor2);
minor2 = vshuffle(minor2, minor2, 2, 3, 0, 1);
tmp1 = row0 * row1;
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2);
minor2 = vmadd(row3, tmp1, minor2);
minor3 = vmsub(row2, tmp1, minor3);
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1);
minor2 = vmsub(row3, tmp1, minor2);
minor3 = vnmadd(row2, tmp1, minor3);
tmp1 = row0 * row3;
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2);
minor1 = vnmadd(row2, tmp1, minor1);
minor2 = vmadd(row1, tmp1, minor2);
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1);
minor1 = vmadd(row2, tmp1, minor1);
minor2 = vnmadd(row1, tmp1, minor2);
tmp1 = row0 * row2;
tmp1 = vshuffle(tmp1, tmp1, 1, 0, 3, 2);
minor1 = vmadd(row3, tmp1, minor1);
minor3 = vnmadd(row1, tmp1, minor3);
tmp1 = vshuffle(tmp1, tmp1, 2, 3, 0, 1);
minor1 = vnmadd(row3, tmp1, minor1);
minor3 = vmadd(row1, tmp1, minor3);
det = row0 * minor0;
det = vshuffle(det, det, 2, 3, 0, 1) + det;
det = vaddf(vshuffle(det, det, 1, 0, 3, 2), det);
tmp1 = vrcpf(det);
det = vsubf(vaddf(tmp1, tmp1), vmulf(det, vmulf(tmp1, tmp1)));
det = vshuffle(det, det, 0, 0, 0, 0);
minor0 = det * minor0;
minor1 = det * minor1;
minor2 = det * minor2;
minor3 = det * minor3;
mat4 result = { minor0, minor1, minor2, minor3 };
return result;
}
static inline mat4 mtranspose(mat4 m)
{
_MM_TRANSPOSE4_PS(m.rows[0], m.rows[1], m.rows[2], m.rows[3]);
return m;
}
static inline mat4 minverse(mat4 m)
{
return mtranspose(minverse_transpose_rows(m.rows[0], m.rows[1], m.rows[2], m.rows[3]));
}
int main()
{
//printv(_mm_setr_ps(1,2,3,4));
vec4 x = { 1, 2, 3, 4 };
vec4 y = { 5, 6, 7, 8 };
vec4 mask = { 1, -1, 1, -1 };
printv(vblend(x, y, mask));
mat4 identity = {{
{ 1, 0, 0, 0 },
{ 0, 1, 0, 0 },
{ 0, 0, 1, 0 },
{ 0, 0, 0, 1 }
}};
mat4 matrix = {{
{ 2, 0, 0, 3 },
{ 0, 3, 0, 2 },
{ 0, 0, 4, 1 },
{ 0, 0, 0, 1 }
}};
//printv4(x);
//printv(x+x);
//printv(__builtin_ia32_shufps(x, x, 0x00));
//printv(__builtin_ia32_shufps(x, y, 0x04));
//printv(vshuffle(x, x, 1, 1, 1, 1));
//printv(vshuffle(x, x, 0, 1, 2, 3));
//printv(vshuffle(x, x, 4, 5, 6, 7));
//printv(vshuffle(x, x, 3, 2, 1, 0));
//printv(-x);
//printv(vdot(x, x));
//printv(vdot3(x, x));
//printv(mvmul(identity, y));
//printv(vmag(x));
//printv(vmagf(x));
//printv(vunit(x));
//printm(identity);
//mat4 test = mtranspose(minverse(identity));
//printm(test);
//vec4 foo = mvmul(mtranspose(minverse(identity)), x);
//printv(foo);
//printm(minverse(matrix));
//printm(mtranspose(identity));
//printm(identity);
//printm(mmmul(identity, identity));
//printm(mmmul_freevec(identity, identity));
//printm(mmmul_add(identity, matrix));
//printm(mmmul(identity, minverse(identity)));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment