Skip to content

Instantly share code, notes, and snippets.

@omorgan7
Created September 19, 2019 18:58
Show Gist options
  • Save omorgan7/aabab1c496fa46a1543323ac0272feaa to your computer and use it in GitHub Desktop.
Save omorgan7/aabab1c496fa46a1543323ac0272feaa to your computer and use it in GitHub Desktop.
// compile with:
// c++ range_for.cpp -std=c++11 -m[sse4.2|avx2|avx|arch=native]
// run with ./a.out array_size
#include <vector>
#include <thread>
#include <cstdio>
#include <numeric>
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <chrono>
#include <cassert>
__attribute__((noinline))
int sum(const std::vector<int>& a)
{
int i = 0;
for (const auto& b : a) i += b;
return i;
}
__attribute__((noinline))
int sum2(const std::vector<int>& a)
{
int i = 0;
for (const int b : a) i += b;
return i;
}
__attribute__((noinline))
int sum3(const std::vector<int>& a)
{
int sum = 0;
for (size_t i = 0, I = a.size(); i < I; ++i) sum += a[i];
return sum;
}
__attribute__((noinline))
int sum4(const std::vector<int>& a)
{
size_t i = 0;
size_t size = a.size();
constexpr int element_width = 8;
static_assert(element_width % 4 == 0, "Must be a multiple of 4!");
size_t size4 = (size / element_width) * element_width;
const int* aPtr = a.data();
int sum = 0;
// read elements until we align to a 16 byte boundary
// this allows the faster aligned load rather than loadu to be used.
while (i < size && (reinterpret_cast<unsigned long long>(aPtr + i) % 16) != 0)
{
sum += aPtr[i++];
}
__m128i a0, a1;
a0 = a1 = _mm_setzero_si128();
for (; i < size4; i += element_width)
{
__m128i elems0 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 0));
__m128i elems1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 4));
// __m128i elems2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 8));
// __m128i elems3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 12));
a0 = _mm_add_epi32(a0, elems0);
a1 = _mm_add_epi32(a1, elems1);
// a2 = _mm_add_epi32(a2, elems2);
// a3 = _mm_add_epi32(a3, elems3);
}
a0 = _mm_add_epi32(a0, a1);
// a2 = _mm_add_epi32(a2, a3);
// a0 = _mm_add_epi32(a0, a2);
// handle remaining cases for sizes non-divisible by 4
for (; i < size; ++i)
{
sum += aPtr[i];
}
a0 = _mm_hadd_epi32(a0, a0);
a0 = _mm_hadd_epi32(a0, a0);
int out = _mm_extract_epi32(a0, 0);
return sum + out;
}
void do_not_optimise(int a)
{
if (std::this_thread::get_id() == std::thread::id())
{
printf("%d\n", a);
}
}
int main(int argc, char** argv)
{
if (argc < 2) return 1;
std::vector<int> a(static_cast<size_t>(std::atoi(argv[1])));
std::iota(a.begin(), a.end(), 0);
{
int s = sum(a);
do_not_optimise(s);
auto time0 = std::chrono::high_resolution_clock::now();
s = sum(a);
auto time1 = std::chrono::high_resolution_clock::now();
do_not_optimise(s);
printf("sum() took: %llu nanoseconds\n", (time1 - time0).count());
}
{
int s = sum2(a);
do_not_optimise(s);
auto time0 = std::chrono::high_resolution_clock::now();
s = sum2(a);
auto time1 = std::chrono::high_resolution_clock::now();
do_not_optimise(s);
printf("sum2() took: %llu nanoseconds\n", (time1 - time0).count());
}
{
int s = sum3(a);
do_not_optimise(s);
auto time0 = std::chrono::high_resolution_clock::now();
s = sum3(a);
auto time1 = std::chrono::high_resolution_clock::now();
do_not_optimise(s);
printf("sum3() took: %llu nanoseconds\n", (time1 - time0).count());
}
{
int s = sum4(a);
do_not_optimise(s);
auto time0 = std::chrono::high_resolution_clock::now();
s = sum4(a);
auto time1 = std::chrono::high_resolution_clock::now();
do_not_optimise(s);
printf("sum4() took: %llu nanoseconds\n", (time1 - time0).count());
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment