Skip to content

Instantly share code, notes, and snippets.

@iillyyaa iillyyaa/test.cpp
Last active Dec 22, 2017

Embed
What would you like to do?
x86intrin vs plain C++ with compiler's help
/*
mkdir $HOME/work
cd $HOME/work
git clone https://github.com/google/benchmark.git
cd benchmark
git checkout v1.3.0
mkdir build
cd build
cmake .. -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX="$(pwd)/../root"
make
make install # this will install into ../root, per the cmake command above
cd ~/work
# save this file as ~/work/test.cpp
Compile and run benchmark as:
g++ \
-I$HOME/work/benchmark/root/include \
-Wall -Wextra -Wconversion -std=c++14 \
-msse4.2 \
-march=native \
-O3 \
test.cpp \
-L$HOME/work/benchmark/root/lib \
-lbenchmark -lpthread \
&& ./a.out
*/
#include <benchmark/benchmark.h>
#include <x86intrin.h>
// Three-component single-precision vector used as benchmark input.
//
// A default-constructed instance carries fixed non-zero sample values, so an
// array of Vector3f holds usable data without any explicit setup.
struct Vector3f {
  float x{1.23f};
  float y{-0.12f};
  float z{0.22f};

  // Keeps the sample values declared above.
  Vector3f() = default;

  // Builds a vector from explicit components.
  Vector3f(float xx, float yy, float zz)
      : x{xx}, y{yy}, z{zz} {}
};
// Twelve-float matrix stored as three rows of four coefficients (aRC = row R,
// column C). The multiply routines in this file treat the fourth column as an
// additive term. Defaults are fixed sample values so that arrays of Matrix43f
// need no explicit initialisation.
struct Matrix43f {
  // Row 1
  float a11{+0.000f};
  float a12{-0.951f};
  float a13{0.309f};
  float a14{+0.002f};
  // Row 2
  float a21{+0.809f};
  float a22{+0.182f};
  float a23{0.559f};
  float a24{-0.001f};
  // Row 3
  float a31{-0.588f};
  float a32{+0.250f};
  float a33{0.769f};
  float a34{+0.001f};
};
// Dot product of two 3-vectors written as ordinary scalar C++.
//
// The three products are summed in z, y, x order inside one expression; any
// vectorisation is left entirely to the compiler.
float dotPlain(Vector3f const & a, Vector3f const & b) {
  const float sum = a.z * b.z + a.y * b.y + a.x * b.x;
  return sum;
}
// Dot product of two 3-vectors using the SSE4.1 dot-product intrinsic.
float dotX86Intr(Vector3f const & a, Vector3f const & b) {
  // _mm_setr_ps takes lanes lowest-first: lane 0 = x, 1 = y, 2 = z, and a
  // zero in the unused lane 3.
  const __m128 lhs = _mm_setr_ps(a.x, a.y, a.z, 0.0f);
  const __m128 rhs = _mm_setr_ps(b.x, b.y, b.z, 0.0f);

  // Mask 0x71: the high nibble (0x7) multiplies lanes 0..2 only, the low
  // nibble (0x1) writes the sum to lane 0.
  const __m128 sum = _mm_dp_ps(lhs, rhs, 0x71);

  // Lane 0 holds the result.
  return _mm_cvtss_f32(sum);
}
// Applies the 3x4 matrix m to v in ordinary scalar C++.
//
// Each output component is the dot product of one matrix row's first three
// coefficients with (x, y, z), plus that row's fourth coefficient.
Vector3f matMulPlain(Vector3f const & v, Matrix43f const & m) {
  const float row1 = m.a11 * v.x + m.a12 * v.y + m.a13 * v.z + m.a14;
  const float row2 = m.a21 * v.x + m.a22 * v.y + m.a23 * v.z + m.a24;
  const float row3 = m.a31 * v.x + m.a32 * v.y + m.a33 * v.z + m.a34;
  return {row1, row2, row3};
}
// Applies the 3x4 matrix m to v using the SSE4.1 dot-product intrinsic.
//
// The vector is extended to (x, y, z, 1) so that each row's fourth
// coefficient is added as part of a four-lane dot product.
Vector3f matMulX86Intr(Vector3f const & v, Matrix43f const & m) {
  // _mm_setr_ps takes lanes lowest-first, so lane 0 is the first argument.
  const __m128 vec  = _mm_setr_ps(v.x, v.y, v.z, 1.0f);
  const __m128 row1 = _mm_setr_ps(m.a11, m.a12, m.a13, m.a14);
  const __m128 row2 = _mm_setr_ps(m.a21, m.a22, m.a23, m.a24);
  const __m128 row3 = _mm_setr_ps(m.a31, m.a32, m.a33, m.a34);

  // Mask 0xF1: multiply all four lanes, write the sum to lane 0.
  return {
      _mm_cvtss_f32(_mm_dp_ps(row1, vec, 0xF1)),
      _mm_cvtss_f32(_mm_dp_ps(row2, vec, 0xF1)),
      _mm_cvtss_f32(_mm_dp_ps(row3, vec, 0xF1)),
  };
}
// Shared benchmark inputs: 1024 vectors and 1024 matrices with static storage.
// No initializer is given, so every element is default-constructed and starts
// with the default member values declared in its struct. The benchmark
// functions below modify a[0].x; nothing in this file writes to m.
static Vector3f a[1024];
static Matrix43f m[1024];
// Benchmark body for dotPlain: 1024 dot products per timed iteration.
static void BM_dotPlain(benchmark::State& state) {
  constexpr int kCount = 1024;  // element count of the input array a[]
  constexpr int kOffset = 13;   // distance to the second operand, wrapping at kCount

  for (auto _ : state) {
    // Change the input on every timed iteration (presumably so the work
    // cannot be treated as loop-invariant).
    a[0].x += 1.0f;

    for (int idx = 0; idx < kCount; ++idx) {
      const int partner = (idx + kOffset) % kCount;
      // DoNotOptimize keeps the unused result from being eliminated as dead code.
      benchmark::DoNotOptimize(dotPlain(a[idx], a[partner]));
    }
  }
}
// Benchmark body for dotX86Intr: 1024 dot products per timed iteration.
static void BM_dotX86Intr(benchmark::State& state) {
  constexpr int kCount = 1024;  // element count of the input array a[]
  constexpr int kOffset = 13;   // distance to the second operand, wrapping at kCount

  for (auto _ : state) {
    // Change the input on every timed iteration (presumably so the work
    // cannot be treated as loop-invariant).
    a[0].x += 1.0f;

    for (int idx = 0; idx < kCount; ++idx) {
      const int partner = (idx + kOffset) % kCount;
      // DoNotOptimize keeps the unused result from being eliminated as dead code.
      benchmark::DoNotOptimize(dotX86Intr(a[idx], a[partner]));
    }
  }
}
// Benchmark body for matMulPlain: 1024 matrix-vector products per timed iteration.
static void BM_matMulPlain(benchmark::State& state) {
  constexpr int kCount = 1024;  // element count of both a[] and m[]
  constexpr int kOffset = 13;   // distance from the vector index to the matrix index, wrapping at kCount

  for (auto _ : state) {
    // Change the input on every timed iteration (presumably so the work
    // cannot be treated as loop-invariant).
    a[0].x += 1.0f;

    for (int idx = 0; idx < kCount; ++idx) {
      const int matIdx = (idx + kOffset) % kCount;
      // DoNotOptimize keeps the unused result from being eliminated as dead code.
      benchmark::DoNotOptimize(matMulPlain(a[idx], m[matIdx]));
    }
  }
}
// Benchmark body for matMulX86Intr: 1024 matrix-vector products per timed iteration.
static void BM_matMulX86Intr(benchmark::State& state) {
  constexpr int kCount = 1024;  // element count of both a[] and m[]
  constexpr int kOffset = 13;   // distance from the vector index to the matrix index, wrapping at kCount

  for (auto _ : state) {
    // Change the input on every timed iteration (presumably so the work
    // cannot be treated as loop-invariant).
    a[0].x += 1.0f;

    for (int idx = 0; idx < kCount; ++idx) {
      const int matIdx = (idx + kOffset) % kCount;
      // DoNotOptimize keeps the unused result from being eliminated as dead code.
      benchmark::DoNotOptimize(matMulX86Intr(a[idx], m[matIdx]));
    }
  }
}
// Register each benchmark function with the Google Benchmark library. The
// intrinsic version of each kernel is listed right before its plain C++
// counterpart.
BENCHMARK(BM_dotX86Intr);
BENCHMARK(BM_dotPlain);
BENCHMARK(BM_matMulX86Intr);
BENCHMARK(BM_matMulPlain);
// BENCHMARK_MAIN() supplies the program's main(); this file defines none of its own.
BENCHMARK_MAIN();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.