Skip to content

Instantly share code, notes, and snippets.

@JPenuchot
Last active July 19, 2018 19:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JPenuchot/66a94c76f85deb71e880058b8400ab24 to your computer and use it in GitHub Desktop.
Save JPenuchot/66a94c76f85deb71e880058b8400ab24 to your computer and use it in GitHub Desktop.
Fully optimized mean function using xsimd
#include <vector>
#include <numeric>
#include <utility>
#include <iostream>
#include <xsimd/xsimd.hpp>
using namespace std;
namespace xs = xsimd;
// Invokes `f` once for every index in the pack `I...`, in pack order.
//
// Each call receives a std::integral_constant<std::size_t, I>, so the callee
// can read the index as a compile-time constant (e.g. decltype(i)::value).
//
// F  callable accepting a std::integral_constant<std::size_t, I>.
// I  indices to visit.
// f  callable to invoke; whatever it returns is discarded.
template<typename F, std::size_t... I>
inline void unroll_impl(F&& f, std::integer_sequence<std::size_t, I...>)
{
    // The void cast keeps the fold on the built-in comma operator even when
    // `f` returns a type that overloads `operator,`.
    ( static_cast<void>(f(std::integral_constant<std::size_t, I>{})), ... );
}
// Compile-time loop: forwards `f` to unroll_impl together with the index
// sequence 0, 1, ..., N - 1, so that `f` is called once per index.
//
// N  number of indices to generate.
// f  callable handed on to unroll_impl.
template<std::size_t N, typename F>
inline void unroll(F&& f)
{
    using indices = std::make_index_sequence<N>;
    unroll_impl(std::forward<F>(f), indices{});
}
template<typename T>
using align_vec =
vector<T, xs::aligned_allocator<T, xs::simd_type<T>::size * sizeof(T)>>;
// Applies `f` element-wise over the input ranges and writes the results to
// [begin, end), going through xsimd batches where possible.
//
// The output range is processed in three phases:
//   1. blocks of U batches, with the U batch operations unrolled,
//   2. the remaining whole batches, one at a time,
//   3. the remaining elements, one at a time.
//
// Phases 1 and 2 call `f` with the values returned by xs::load_aligned and
// pass its result to xs::store_aligned; phase 3 calls `f` with the
// dereferenced inputs and assigns the result to `*begin`. `f` therefore has
// to be callable in both forms.
//
// T       element type; xs::simd_type<T>::size gives the batch length.
// U       number of batches per unrolled block; must be at least 1.
// f       callable taking one argument per input range.
// begin   start of the output range; written with xs::store_aligned, so it
//         has to meet whatever alignment that call requires.
// end     one past the last output element.
// Begins  start of each input range; read with xs::load_aligned. Each input
//         is read for as many elements as [begin, end) holds.
template<typename T, std::size_t U = 4, typename F, typename It, typename... V>
void functor(F&& f, It begin, It end, V... Begins)
{
    // U == 0 would make the block length zero and the modulo below undefined.
    static_assert(U > 0, "functor: the unroll factor U must be at least 1");

    // Nothing to do for an empty (or reversed) range; bailing out here also
    // keeps the unsigned element count below well-defined.
    if(!(begin < end))
        return;

    constexpr std::size_t lanes = xs::simd_type<T>::size;
    constexpr std::size_t block = U * lanes;

    using diff_t = decltype(end - begin);
    const auto count = static_cast<std::size_t>(end - begin);

    // End markers are built with iterator arithmetic rather than `&begin[n]`,
    // so the past-the-end element is never subscripted and `It` does not have
    // to be a raw pointer for the loop comparisons to compile.
    const It simd_end          = begin + static_cast<diff_t>(count - count % lanes);
    const It unrolled_simd_end = begin + static_cast<diff_t>(count - count % block);

    // Loads one batch from every input at the given compile-time offset,
    // applies `f`, and stores the result at the same offset of the output.
    auto simd_op = [&](auto offset)
    {
        constexpr std::size_t at = decltype(offset)::value;
        xs::store_aligned(&begin[at], f(xs::load_aligned(&Begins[at])...));
    };

    // Unrolled SIMD core: U batches per iteration.
    for(; begin < unrolled_simd_end; begin += block, ((Begins += block), ...))
        unroll<U>([&](auto I)
        {
            simd_op(std::integral_constant<std::size_t, I * lanes>{});
        });

    // SIMD end: whole batches that did not fill an unrolled block.
    for(; begin < simd_end; begin += lanes, ((Begins += lanes), ...))
        simd_op(std::integral_constant<std::size_t, 0>{});

    // Scalar end: elements that did not fill a batch.
    for(; begin < end; ++begin, ((++Begins), ...))
        *begin = f((*Begins)...);
}
// Demo: adds two aligned float vectors element-wise with functor() and prints
// every element of the result on its own line.
int main(int argc, char const *argv[])
{
    // 1001 is odd, so whenever the batch length is even some elements are
    // left over for the scalar tail of functor().
    const unsigned sz = 1001;

    // Fill values are passed to the constructor directly instead of
    // value-initialising the vectors and overwriting them afterwards.
    align_vec<float> a(sz, 500.f), b(sz, 1000.f), r(sz);

    functor<float>( [](auto&& lhs, auto&& rhs) { return lhs + rhs; }
                  , r.data(), r.data() + r.size()
                  , a.data()
                  , b.data()
                  );

    // '\n' instead of std::endl: same output, without a flush on every line.
    for(const auto& val : r) std::cout << val << '\n';

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment