Skip to content

Instantly share code, notes, and snippets.

@timshen91
Last active January 17, 2018 23:30
Show Gist options
  • Save timshen91/0f321fe2c5cfb04015917c0529052158 to your computer and use it in GitHub Desktop.
Save timshen91/0f321fe2c5cfb04015917c0529052158 to your computer and use it in GitHub Desktop.
/*
clang++ --version
clang++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2 && time ./a.out
clang++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2 && time ./a.out
clang++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2 && time ./a.out
g++ --version
g++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2 && time ./a.out
g++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2 && time ./a.out
g++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2 && time ./a.out
Output on x86_64-linux-gnu, SSE4.2:
+ clang++ --version
clang version 3.8.1-24 (tags/RELEASE_381/final)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
+ clang++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2
+ ./a.out
3276800000
real 0m2.192s
user 0m2.188s
sys 0m0.000s
+ clang++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2
+ ./a.out
3276800000
real 0m0.125s
user 0m0.124s
sys 0m0.000s
+ clang++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2
+ ./a.out
3276800000
real 0m0.183s
user 0m0.180s
sys 0m0.000s
+ g++ --version
g++ (Debian 6.3.0-18) 6.3.0 20170516
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ g++ -DTEST_WITH_FUNC=SumAutoVec a.cc -std=c++11 -O2
+ ./a.out
3276800000
real 0m1.583s
user 0m1.580s
sys 0m0.000s
+ g++ -DTEST_WITH_FUNC=SumSad a.cc -std=c++11 -O2
+ ./a.out
3276800000
real 0m0.196s
user 0m0.192s
sys 0m0.004s
+ g++ -DTEST_WITH_FUNC=SumElementWiseInt16Acc a.cc -std=c++11 -O2
+ ./a.out
3276800000
real 0m0.250s
user 0m0.248s
sys 0m0.000s
*/
#include <cassert>
#include <cstdint>
#include <emmintrin.h>
#include <vector>
#include <iostream>
// GCC/Clang vector-extension aliases: 16-byte vectors viewed as lanes of a
// fixed-width integer type, so that individual lanes can be read with [].
// V64: two int64_t lanes.
using V64 = int64_t __attribute__((vector_size(16)));
// V16: eight int16_t lanes.
using V16 = int16_t __attribute__((vector_size(16)));
// Sums the `n` bytes starting at `buffer` with a plain scalar loop, leaving
// any vectorization to the compiler.
//
// buffer: pointer to at least `n` readable bytes (no alignment requirement).
// n:      number of bytes to sum; 0 yields 0.
// Returns the sum of all bytes, each treated as an unsigned value.
inline int64_t SumAutoVec(uint8_t *buffer, size_t n) {
  int64_t ret = 0;
  // Index with size_t so the counter has the same type as `n`; an `int`
  // counter would overflow (undefined behaviour) for n > INT_MAX.
  for (size_t i = 0; i < n; i++) {
    ret += buffer[i];
  }
  return ret;
}
// Sums the `n` bytes starting at `buffer` using the SSE2 "sum of absolute
// differences" instruction against zero, which adds each group of 8 unsigned
// bytes into a 64-bit lane.
//
// buffer: pointer to `n` readable bytes; must be 16-byte aligned (asserted).
// n:      number of bytes to sum; must be a multiple of 16 (asserted).
// Returns the sum of all bytes, each treated as an unsigned value.
inline int64_t SumSad(uint8_t *buffer, size_t n) {
  assert(n % 16 == 0);
  assert(uintptr_t(buffer) % 16 == 0);
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  // Index with size_t so the counter has the same type as `n`; an `int`
  // counter would overflow (undefined behaviour) for n > INT_MAX.
  for (size_t i = 0; i < n; i += 16) {
    __m128i data = _mm_load_si128(reinterpret_cast<__m128i *>(buffer + i));
    // |data - 0| summed per 8-byte half yields two 64-bit partial sums.
    acc = _mm_add_epi64(_mm_sad_epu8(data, zero), acc);
  }
  // Spill the two 64-bit lanes through an aligned store instead of relying on
  // a compiler-specific vector cast to read them.
  alignas(16) int64_t lanes[2];
  _mm_store_si128(reinterpret_cast<__m128i *>(lanes), acc);
  return lanes[0] + lanes[1];
}
// Sums the `n` bytes starting at `buffer` by zero-extending bytes to 16-bit
// lanes and accumulating them element-wise in a vector of eight 16-bit lanes.
//
// Each 16-byte block adds at most 2 * 255 = 510 to every lane, so the 16-bit
// accumulator is flushed into the 64-bit total every 128 blocks
// (128 * 510 = 65280 <= 65535) to keep the lanes from wrapping.
//
// buffer: pointer to `n` readable bytes; must be 16-byte aligned (asserted).
// n:      number of bytes to sum; must be a multiple of 16 (asserted).
// Returns the sum of all bytes, each treated as an unsigned value.
inline int64_t SumElementWiseInt16Acc(uint8_t *buffer, size_t n) {
  assert(n % 16 == 0);
  assert(uintptr_t(buffer) % 16 == 0);
  constexpr size_t kBytesPerBlock = 16;
  constexpr size_t kBlocksPerFlush = 128;
  constexpr size_t kBytesPerFlush = kBytesPerBlock * kBlocksPerFlush;
  const __m128i zero = _mm_setzero_si128();
  int64_t ret = 0;
  for (size_t base = 0; base < n; base += kBytesPerFlush) {
    // End of this flush window, clamped to the end of the input.
    const size_t end = (n - base < kBytesPerFlush) ? n : base + kBytesPerFlush;
    __m128i acc = zero;
    for (size_t i = base; i < end; i += kBytesPerBlock) {
      __m128i data = _mm_load_si128(reinterpret_cast<__m128i *>(buffer + i));
      // Interleaving with zero widens each byte to an unsigned 16-bit lane.
      __m128i lo = _mm_unpacklo_epi8(data, zero);
      __m128i hi = _mm_unpackhi_epi8(data, zero);
      acc = _mm_add_epi16(_mm_add_epi16(lo, hi), acc);
    }
    // Read the lanes back as unsigned so values above 32767 are not
    // misinterpreted as negative.
    alignas(16) uint16_t lanes[8];
    _mm_store_si128(reinterpret_cast<__m128i *>(lanes), acc);
    for (uint16_t lane : lanes) {
      ret += lane;
    }
  }
  return ret;
}
// Benchmark driver: repeatedly runs the summation routine selected by the
// TEST_WITH_FUNC macro over a 32768-byte buffer whose bytes are all 1, then
// prints the accumulated total followed by a newline.
int main() {
  std::vector<uint8_t> bytes(32768, 1);
  int64_t total = 0;
  // 100000 passes over the same buffer.
  for (int remaining = 100000; remaining > 0; --remaining) {
    total += TEST_WITH_FUNC(bytes.data(), bytes.size());
  }
  std::cout << total << "\n";
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment