Skip to content

Instantly share code, notes, and snippets.

@Triang3l
Last active October 18, 2017 13:10
Show Gist options
  • Save Triang3l/e594a7cde9acce45db3f4b36ad7abfcf to your computer and use it in GitHub Desktop.
Save Triang3l/e594a7cde9acce45db3f4b36ad7abfcf to your computer and use it in GitHub Desktop.
#include <DirectXMath.h>
inline DirectX::XMVECTOR Vector4TransformRM(DirectX::XMVECTOR v, const DirectX::XMMATRIX &m) {
    // Transforms v by a row-major matrix: lane i of the result is dot(m.r[i], v).
    // Plain XMVector4Transform is not usable for this, because DirectXMath treats the
    // matrix as column-major (mul/mad accumulation) while this layout needs per-row dots.
#if defined(_XM_SSE_INTRINSICS_)
    // Cost: 15 instructions (4 blends + 4 multiplies + 4 gathers + 3 adds), versus
    // 8 for XMMatrixTranspose followed by 11 for XMVector4Transform.

    // Phase 1: blend pairs of rows. Mask (3, 2, 1, 0) keeps lanes x, y of the first
    // operand and lanes z, w of the second.
    const DirectX::XMVECTOR row0xy_row1zw =
        _mm_shuffle_ps(m.r[0], m.r[1], _MM_SHUFFLE(3, 2, 1, 0));  // r0x | r0y | r1z | r1w
    const DirectX::XMVECTOR row1xy_row0zw =
        _mm_shuffle_ps(m.r[1], m.r[0], _MM_SHUFFLE(3, 2, 1, 0));  // r1x | r1y | r0z | r0w
    const DirectX::XMVECTOR row2xy_row3zw =
        _mm_shuffle_ps(m.r[2], m.r[3], _MM_SHUFFLE(3, 2, 1, 0));  // r2x | r2y | r3z | r3w
    const DirectX::XMVECTOR row3xy_row2zw =
        _mm_shuffle_ps(m.r[3], m.r[2], _MM_SHUFFLE(3, 2, 1, 0));  // r3x | r3y | r2z | r2w

    // Phase 2: component-wise products with v.
    // vx * r0x | vy * r0y | vz * r1z | vw * r1w
    const DirectX::XMVECTOR prod01 = _mm_mul_ps(v, row0xy_row1zw);
    // vx * r2x | vy * r2y | vz * r3z | vw * r3w
    const DirectX::XMVECTOR prod23 = _mm_mul_ps(v, row2xy_row3zw);
    // vx * r1x | vy * r1y | vz * r0z | vw * r0w
    const DirectX::XMVECTOR prod10 = _mm_mul_ps(v, row1xy_row0zw);
    // vx * r3x | vy * r3y | vz * r2z | vw * r2w
    const DirectX::XMVECTOR prod32 = _mm_mul_ps(v, row3xy_row2zw);

    // Phase 3: gather the products so that lane i only holds terms belonging to row i.
    // vx * r0x | vz * r1z | vx * r2x | vz * r3z
    const DirectX::XMVECTOR evenLanes = _mm_shuffle_ps(prod01, prod23, _MM_SHUFFLE(2, 0, 2, 0));
    // vy * r0y | vw * r1w | vy * r2y | vw * r3w
    const DirectX::XMVECTOR oddLanes = _mm_shuffle_ps(prod01, prod23, _MM_SHUFFLE(3, 1, 3, 1));
    // vz * r0z | vx * r1x | vz * r2z | vx * r3x
    const DirectX::XMVECTOR evenLanesSwapped = _mm_shuffle_ps(prod10, prod32, _MM_SHUFFLE(0, 2, 0, 2));
    // vw * r0w | vy * r1y | vw * r2w | vy * r3y
    const DirectX::XMVECTOR oddLanesSwapped = _mm_shuffle_ps(prod10, prod32, _MM_SHUFFLE(1, 3, 1, 3));

    // Phase 4: each partial sum covers two of the four terms of every row's dot product
    // (xy | zw | xy | zw, then zw | xy | zw | xy); adding them completes all four dots.
    const DirectX::XMVECTOR partialSum = _mm_add_ps(evenLanes, oddLanes);
    const DirectX::XMVECTOR partialSumSwapped = _mm_add_ps(evenLanesSwapped, oddLanesSwapped);
    return _mm_add_ps(partialSum, partialSumSwapped);
#else
    // No SSE path available: transpose so the rows become what XMVector4Transform expects.
#pragma message("Using XMMatrixTranspose and XMVector4Transform for Vector4Transform on this platform.")
    return DirectX::XMVector4Transform(v, DirectX::XMMatrixTranspose(m));
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment