Skip to content

Instantly share code, notes, and snippets.

@belltailjp
Created January 28, 2013 07:38
Show Gist options
  • Save belltailjp/4653695 to your computer and use it in GitHub Desktop.
Save belltailjp/4653695 to your computer and use it in GitHub Desktop.
SSE,AVX組み込み関数を用いたベクトルの内積計算高速化の実験コード
#include <iostream>
#include <random>
#include <algorithm>
#include <boost/format.hpp>
#include <xmmintrin.h>
#include <immintrin.h>
#include <osakana/stopwatch.hpp>
float dot_normal(const float *vec1, const float *vec2, unsigned n)
{
float sum = 0;
for(unsigned i = 0; i < n; ++i)
sum += vec1[i] * vec2[i];
return sum;
}
float dot_sse(const float *vec1, const float *vec2, unsigned n)
{
__m128 u = {0};
for (unsigned i = 0; i < n; i += 4)
{
__m128 w = _mm_load_ps(&vec1[i]);
__m128 x = _mm_load_ps(&vec2[i]);
x = _mm_mul_ps(w, x);
u = _mm_add_ps(u, x);
}
__attribute__((aligned(16))) float t[4] = {0};
_mm_store_ps(t, u);
return t[0] + t[1] + t[2] + t[3];
}
float dot_avx(const float *vec1, const float *vec2, unsigned n)
{
__m256 u = {0};
for(unsigned i = 0; i < n; i += 8)
{
__m256 w = _mm256_load_ps(&vec1[i]);
__m256 x = _mm256_load_ps(&vec2[i]);
x = _mm256_mul_ps(w, x);
u = _mm256_add_ps(u, x);
}
__attribute__((aligned(32))) float t[8] = {0};
_mm256_store_ps(t, u);
return t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
}
float dot_avx_2(const float *vec1, const float *vec2, unsigned n)
{
__m256 u1 = {0};
__m256 u2 = {0};
for(unsigned i = 0; i < n; i += 16)
{
__m256 w1 = _mm256_load_ps(&vec1[i]);
__m256 w2 = _mm256_load_ps(&vec1[i + 8]);
__m256 x1 = _mm256_load_ps(&vec2[i]);
__m256 x2 = _mm256_load_ps(&vec2[i + 8]);
x1 = _mm256_mul_ps(w1, x1);
x2 = _mm256_mul_ps(w2, x2);
u1 = _mm256_add_ps(u1, x1);
u2 = _mm256_add_ps(u2, x2);
}
u1 = _mm256_add_ps(u1, u2);
__attribute__((aligned(32))) float t[8] = {0};
_mm256_store_ps(t, u1);
return t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
}
/*
//FMA版
float dot_avx_fma(const float *vec1, const float *vec2, unsigned n)
{
__m256 u1 = {0};
__m256 u2 = {0};
for(unsigned i = 0; i < n; i += 16)
{
__m256 w1 = _mm256_load_ps(&vec1[i]);
__m256 w2 = _mm256_load_ps(&vec1[i + 8]);
__m256 x1 = _mm256_load_ps(&vec2[i]);
__m256 x2 = _mm256_load_ps(&vec2[i + 8]);
//FMA命令で加算と乗算を行うけど,Haswellアーキテクチャ待ち(´・ω・`)
u1 = _mm256_fmadd_ps(w1, x1, u1);
u2 = _mm256_fmadd_ps(w2, x2, u2);
}
u1 = _mm256_add_ps(u1, u2);
//レジスタから書き戻し
__attribute__((aligned(32))) float t[8] = {0};
_mm256_store_ps(t, u1);
return t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
}
*/
template<class T>
double calc_for_a_moment(T t, unsigned ms)
{
osakana::stopwatch sw;
int cnt = 0;
volatile float sum = 0; //最適化で消されるの防止
while(sw.elapsed_ms() < ms)
{
sum += t();
++cnt;
}
return sw.elapsed_us()/1000.0 / cnt;
}
int main()
{
const unsigned len_begin = 8;
const unsigned len_end = 512 * 1024;
const unsigned len_fact = 2;
const unsigned run_ms = 250;
std::mt19937 rng;
std::uniform_real_distribution<> dst(-1, 1);
for(unsigned len = len_begin; len <= len_end; len *= len_fact)
{
float *p1 = new __attribute__((aligned(32))) float[len + 8];
float *p2 = new __attribute__((aligned(32))) float[len + 8];
float *vec1 = p1;
float *vec2 = p2;
while(reinterpret_cast<long>(vec1) % 32) ++vec1;
while(reinterpret_cast<long>(vec2) % 32) ++vec2;
std::generate(vec1, vec1 + len, [&rng, &dst](){ return dst(rng); });
std::generate(vec2, vec2 + len, [&rng, &dst](){ return dst(rng); });
std::cout << (boost::format("%d %lf %lf %lf %lf")
% len
% calc_for_a_moment([vec1, vec2, len](){ return dot_sse (vec1, vec2, len); }, run_ms)
% calc_for_a_moment([vec1, vec2, len](){ return dot_avx (vec1, vec2, len); }, run_ms)
% calc_for_a_moment([vec1, vec2, len](){ return dot_avx_2 (vec1, vec2, len); }, run_ms)
% calc_for_a_moment([vec1, vec2, len](){ return dot_normal(vec1, vec2, len); }, run_ms)
) << std::endl;
delete[] p1;
delete[] p2;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment