Skip to content

Instantly share code, notes, and snippets.

@PureWhiteWu
Created January 6, 2021 08:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PureWhiteWu/e88f241fc8b62df06ae1eb04923a88ae to your computer and use it in GitHub Desktop.
Save PureWhiteWu/e88f241fc8b62df06ae1eb04923a88ae to your computer and use it in GitHub Desktop.
little_2_big_gcc.cpp
#include <memory>
#include <chrono>
#include <iostream>
#include <immintrin.h>
#include <byteswap.h>
// Number of 64-bit elements in each benchmark buffer.
const int N = 1024 * 64;
// Input buffer; main() fills it before any kernel runs.
long long int src_data[N];
// Output buffers, one per kernel, compared element-by-element at the end of main().
long long int avx512_data[N];
long long int avx2_data[N];
long long int bswap_result[N];
// Byte-shuffle control used by the AVX routines: read from the least- to the
// most-significant byte it holds the indices 7, 6, ..., 0.
const long long int MASK = 0x0001020304050607;
// bit16mask[k] has the k lowest bits set, for k = 0..16. The AVX-512 routine
// indexes it with the number of leftover elements to build its tail lane mask.
const __mmask16 bit16mask[17] = {0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff};
using namespace std;
// Endianness-conversion kernels under test; each takes a source pointer, a
// destination pointer and an element count. Definitions follow main().
void avx512_little_2_big(const long long int *src, long long int *dst, int n);
void avx2_little_2_big(const long long int *src, long long int *dst, int n);
void bswap_little_2_big(const long long int *src, long long int *dst, int n);
/**
 * Benchmark driver: times the AVX-512, AVX2 and scalar bswap kernels over
 * src_data, then checks that all three output buffers agree.
 *
 * Prints one timing line per kernel and finally "test result :1" when every
 * element of avx512_data and avx2_data matches bswap_result, or
 * "test result :0" (preceded by the first mismatching values) otherwise.
 *
 * NOTE(review): each timing prints total_microseconds / 10 over 10000 calls.
 * That number is the mean time per call in nanoseconds (equivalently,
 * microseconds per 1000 calls), yet the label says "us". The output is left
 * unchanged here; confirm which unit was intended.
 */
int main()
{
    // Give every element a distinct value. With identical inputs a kernel that
    // mixes bytes from neighbouring elements would still match the reference.
    for (int i = 0; i < N; i++)
    {
        src_data[i] = 0x0123456789abcdefLL + i;
    }

    // Warm-up
    for (int i = 0; i < 10000; i++)
    {
        avx512_little_2_big(src_data, avx512_data, N);
    }
    auto start = std::chrono::steady_clock::now();
    int loop = 10000;
    while (loop--)
    {
        avx512_little_2_big(src_data, avx512_data, N);
    }
    auto end = std::chrono::steady_clock::now();
    std::cout << "avx512 time: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 10 << " us" << std::endl;
    std::cout << "avx512 result:" << std::endl;
    // Debug dump (disabled):
    //   for each i: cout << hex << src_data[i] << " " << hex << avx512_data[i] << endl;

    // Warm-up
    for (int i = 0; i < 100; i++)
    {
        avx2_little_2_big(src_data, avx2_data, N);
    }
    start = std::chrono::steady_clock::now();
    loop = 10000;
    while (loop--)
    {
        avx2_little_2_big(src_data, avx2_data, N);
    }
    end = std::chrono::steady_clock::now();
    std::cout << "avx2 time: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 10 << " us" << std::endl;

    // Warm-up. Writes into bswap_result, not avx2_data: the AVX2 output must
    // survive untouched until the comparison below, otherwise that comparison
    // would only be checking the reference against itself.
    for (int i = 0; i < 100; i++)
    {
        bswap_little_2_big(src_data, bswap_result, N);
    }
    start = std::chrono::steady_clock::now();
    loop = 10000;
    while (loop--)
    {
        bswap_little_2_big(src_data, bswap_result, N);
    }
    end = std::chrono::steady_clock::now();
    std::cout << "bswap time: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 10 << " us" << std::endl;
    std::cout << "bswap result:" << std::endl;
    // Debug dump (disabled):
    //   for each i: cout << hex << src_data[i] << " " << hex << bswap_result[i] << endl;

    // Cross-check both SIMD outputs against the scalar reference; stop at the
    // first disagreement and print the three values (bswap, avx512, avx2).
    int pass = 1;
    for (int i = 0; i < N; i++)
    {
        if (bswap_result[i] != avx512_data[i] || bswap_result[i] != avx2_data[i])
        {
            std::cout << std::hex << bswap_result[i] << " " << avx512_data[i] << " " << avx2_data[i] << std::endl;
            pass = 0;
            break;
        }
    }
    std::cout << "test result :" << pass << std::endl;
    return 0;
}
/**
 * Reverses the byte order of n 64-bit integers using 512-bit byte shuffles.
 *
 * @param src  input array of at least n elements (unaligned access is used)
 * @param dst  output array of at least n elements; receives the swapped values
 * @param n    element count; values <= 0 leave dst untouched
 *
 * Eight elements are processed per iteration; a final partial group of
 * n % 8 elements is handled with a masked load/store so nothing beyond the
 * n-th element is read or written.
 *
 * _mm512_shuffle_epi8 is an AVX-512BW instruction, so the build must enable it.
 */
void avx512_little_2_big(const long long int *src, long long int *dst, int n)
{
    if (n <= 0)
    {
        // A negative n would otherwise give a negative n % 8 and index
        // bit16mask out of bounds.
        return;
    }

    // The byte shuffle selects bytes within each 128-bit lane, and a lane holds
    // two 64-bit elements. The lower element is reversed by indices 7..0 (MASK);
    // the upper element needs indices 15..8. Broadcasting MASK to both halves
    // would copy the reversed lower element into the upper one.
    const long long int HIGH_QWORD_MASK = 0x08090a0b0c0d0e0f;
    const __m512i mask = _mm512_set_epi64(HIGH_QWORD_MASK, MASK, HIGH_QWORD_MASK, MASK,
                                          HIGH_QWORD_MASK, MASK, HIGH_QWORD_MASK, MASK);

    const int loop_count = n / 8;
    const int remainder = n % 8;
    for (int i = 0; i < loop_count; i++)
    {
        const int index = i * 8;
        const __m512i input_data = _mm512_loadu_si512(&src[index]);
        const __m512i output_data = _mm512_shuffle_epi8(input_data, mask);
        _mm512_storeu_si512(&dst[index], output_data);
    }
    if (remainder != 0)
    {
        const int index = loop_count * 8;
        // Low `remainder` bits set: one bit per 64-bit element still to process.
        const __mmask8 tail = static_cast<__mmask8>(bit16mask[remainder]);
        const __m512i padding = _mm512_set1_epi64(0);
        const __m512i input_data = _mm512_mask_loadu_epi64(padding, tail, &src[index]);
        const __m512i output_data = _mm512_shuffle_epi8(input_data, mask);
        _mm512_mask_storeu_epi64(&dst[index], tail, output_data);
    }
}
/**
 * Reverses the byte order of n 64-bit integers using 256-bit byte shuffles.
 *
 * @param src  input array of at least n elements (unaligned access is used)
 * @param dst  output array of at least n elements; receives the swapped values
 * @param n    element count; values <= 0 leave dst untouched
 *
 * Four elements are processed per iteration; the remaining n % 4 elements
 * are converted one at a time with bswap_64.
 */
void avx2_little_2_big(const long long int *src, long long int *dst, int n)
{
    // The byte shuffle selects bytes within each 128-bit lane, and a lane holds
    // two 64-bit elements. The lower element is reversed by indices 7..0 (MASK);
    // the upper element needs indices 15..8. Broadcasting MASK to both halves
    // would copy the reversed lower element into the upper one.
    const long long int HIGH_QWORD_MASK = 0x08090a0b0c0d0e0f;
    const __m256i mask = _mm256_set_epi64x(HIGH_QWORD_MASK, MASK, HIGH_QWORD_MASK, MASK);

    const int loop_count = n / 4;
    const int remainder = n % 4;
    for (int i = 0; i < loop_count; i++)
    {
        const int index = i * 4;
        const __m256i input_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&src[index]));
        const __m256i output_data = _mm256_shuffle_epi8(input_data, mask);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(&dst[index]), output_data);
    }

    // Scalar tail. For n < 0 both loop_count and remainder are <= 0, so neither
    // loop executes.
    const int index = loop_count * 4;
    for (int i = index; i < index + remainder; i++)
    {
        dst[i] = bswap_64(src[i]);
    }
}
// Scalar reference implementation: writes the byte-reversed form of each of
// the first n elements of src to the matching position in dst. Does nothing
// when n is zero or negative.
void bswap_little_2_big(const long long int *src, long long int *dst, int n)
{
    const long long int *in = src;
    long long int *out = dst;
    for (int remaining = n; remaining > 0; --remaining)
    {
        *out++ = bswap_64(*in++);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment