Skip to content

Instantly share code, notes, and snippets.

@NiallHornFX
Last active May 7, 2020 23:58
Show Gist options
  • Save NiallHornFX/5c890a416229a584007d4938cf5150af to your computer and use it in GitHub Desktop.
Save NiallHornFX/5c890a416229a584007d4938cf5150af to your computer and use it in GitHub Desktop.
// SSE Cross Product Test Implementations Test -
//
// Times two shuffle-based SSE implementations of the 3D cross product a x b,
// five times each, and prints the elapsed time in nanoseconds and in seconds.
//
//   Version A: 4 shuffles, 2 multiplies, 1 subtract.
//   Version B: 3 shuffles, 2 multiplies, 1 subtract (operands shuffled once,
//              result re-ordered afterwards).
//
// NOTE(review): each timed region is only a handful of instructions and
// res_A / res_B are never read afterwards, so the optimiser is free to fold or
// remove the work. The printed numbers likely reflect clock overhead more than
// the cross product itself - treat them as indicative only.
temp_vec3<float> a(1.0f, 2.0f, 3.0f);
temp_vec3<float> b(4.0f, 5.0f, 6.0f);

// Prints the banner followed by the elapsed time, first as integer nanoseconds
// and then as fixed-point seconds with 8 decimals.
const auto report = [](const char *banner, std::chrono::steady_clock::duration elapsed)
{
    const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
    std::cout << banner;
    std::cout << "Calc Time = " << ns << "ns\n";
    std::cout << "Calc Time = " << std::fixed << std::setprecision(8) << double(ns) / double(1e+09) << "s\n";
};

for (std::size_t j = 0; j < 5; ++j)
{
    std::cout << "\nITER = " << j << "\n";

    // ---- Version A ----
    // (No trailing backslash on this comment: a '\' at the end of a '//' line
    //  splices the next source line into the comment and would swallow the
    //  timer start below.)
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by wall-clock adjustments.
    auto start = std::chrono::steady_clock::now();
    // _MM_SHUFFLE(3, ...) keeps lane 3 of the input in lane 3 of the output.
    __m128 a_0 = _mm_shuffle_ps(a.sv, a.sv, _MM_SHUFFLE(3, 0, 2, 1)); // = a.y, a.z, a.x, a.w
    __m128 a_1 = _mm_shuffle_ps(a.sv, a.sv, _MM_SHUFFLE(3, 1, 0, 2)); // = a.z, a.x, a.y, a.w
    __m128 b_0 = _mm_shuffle_ps(b.sv, b.sv, _MM_SHUFFLE(3, 1, 0, 2)); // = b.z, b.x, b.y, b.w
    __m128 b_1 = _mm_shuffle_ps(b.sv, b.sv, _MM_SHUFFLE(3, 0, 2, 1)); // = b.y, b.z, b.x, b.w
    // Actual CP math - (a_0 * b_0) - (a_1 * b_1):
    //   (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, a.w*b.w - a.w*b.w)
    __m128 res_A = _mm_sub_ps(_mm_mul_ps(a_0, b_0), _mm_mul_ps(a_1, b_1));
    auto end = std::chrono::steady_clock::now();
    // Debug (m128_f32 is an MSVC-specific member of __m128):
    //for (std::size_t i = 0; i < 3; ++i) std::cout << res_A.m128_f32[i] << " | ";
    report("\n ============== SSE Cross Product - Version A =============\n", end - start);

    // ---- Version B ----
    start = std::chrono::steady_clock::now();
    // tmp0 = (b.y, b.z, b.x, b.w)
    __m128 tmp0 = _mm_shuffle_ps(b.sv, b.sv, _MM_SHUFFLE(3, 0, 2, 1));
    // tmp1 = (a.y, a.z, a.x, a.w)
    __m128 tmp1 = _mm_shuffle_ps(a.sv, a.sv, _MM_SHUFFLE(3, 0, 2, 1));
    // tmp0 = (b.y*a.x, b.z*a.y, b.x*a.z, b.w*a.w)
    tmp0 = _mm_mul_ps(tmp0, a.sv);
    // tmp1 = (a.y*b.x, a.z*b.y, a.x*b.z, a.w*b.w)
    tmp1 = _mm_mul_ps(tmp1, b.sv);
    // tmp2 = (a.x*b.y - a.y*b.x, a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, 0)
    //      = (cross.z, cross.x, cross.y, 0)
    __m128 tmp2 = _mm_sub_ps(tmp0, tmp1);
    // Rotate the lanes back into x, y, z order.
    // res_B = (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0)
    __m128 res_B = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
    end = std::chrono::steady_clock::now();
    // Debug __m128s (m128_f32 is an MSVC-specific member of __m128):
    //for (std::size_t k = 0; k < 3; ++k) std::cout << res_B.m128_f32[k] << " | ";
    report("\n ============== SSE Cross Product - Version B =============\n", end - start);
}
// temp_vec3<T>
//
// Minimal 3-component vector whose storage can be viewed three ways through an
// anonymous union: as named fields (x, y, z), as an array (v[3]), or as a
// packed SSE register (sv).
//
// NOTE(review): the sv view only lines up lane-for-lane with x / y / z when
// sizeof(T) == sizeof(float); for other T the register view is presumably not
// meaningful - confirm before instantiating with anything but float.
// NOTE(review): the anonymous struct and reading one union member after
// writing another rely on compiler extensions rather than standard C++.
template <class T>
struct temp_vec3
{
    temp_vec3() = delete;

    // Builds the vector from three components. sv is zeroed first so the
    // fourth SSE lane holds 0 instead of an indeterminate value.
    temp_vec3(T xx, T yy, T zz) : sv(_mm_setzero_ps()) { x = xx; y = yy; z = zz; }

    // Builds the vector from the first three elements of `a`.
    // `a` must point to at least three readable T values.
    explicit temp_vec3(const T *a) : sv(_mm_setzero_ps()) { x = a[0]; y = a[1]; z = a[2]; }

    // Wraps an existing SSE register; all four lanes are taken as-is.
    explicit temp_vec3(__m128 ssv) : sv(ssv) {}

    union
    {
        struct { T x, y, z; }; // named component view
        T v[3];                // indexed component view
        __m128 sv;             // packed SSE register view
    };
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment