// little_2_big_gcc.cpp
// GitHub gist PureWhiteWu/e88f241fc8b62df06ae1eb04923a88ae, created January 6, 2021 08:17.
// Note from the GitHub viewer: this file contains bidirectional Unicode text that may be
// interpreted or compiled differently than it appears; review it in an editor that
// reveals hidden Unicode characters.
#include <byteswap.h>
#include <immintrin.h>

#include <chrono>
#include <iostream>
#include <memory>
// Number of 64-bit elements in each benchmark buffer.
const int N = 1024 * 64;

// Shared input buffer and one output buffer per implementation under test.
long long int src_data[N];
long long int avx512_data[N];
long long int avx2_data[N];
long long int bswap_result[N];

// Byte-shuffle control for one 64-bit element: stored little-endian its bytes
// are 07 06 05 04 03 02 01 00, so output byte j is taken from input byte 7 - j.
// NOTE: the (v)pshufb indices are relative to the enclosing 128-bit lane, so
// broadcasting this value to every qword only reverses the LOW qword of each
// lane correctly; the high qword needs the control 0x08090a0b0c0d0e0f.
const long long int MASK = 0x0001020304050607;

// bit16mask[k] has the k lowest bits set (k = 0..16); used as a write/load
// mask selecting the first k elements of a vector.
const __mmask16 bit16mask[17] = {0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
                                 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff};

using namespace std;

// Each kernel byte-swaps n 64-bit integers; the definitions follow main().
void avx512_little_2_big(const long long int *src, long long int *dst, int n);
void avx2_little_2_big(const long long int *src, long long int *dst, int n);
void bswap_little_2_big(const long long int *src, long long int *dst, int n);
int main() | |
{ | |
for (int i = 0; i < N; i++) | |
{ | |
src_data[i] = 0x0123456789abcdef; | |
} | |
// 预热 | |
for (int i = 0; i < 10000; i++) | |
{ | |
avx512_little_2_big(src_data, avx512_data, N); | |
} | |
auto start = chrono::steady_clock::now(); | |
int loop = 10000; | |
while (loop--) | |
{ | |
avx512_little_2_big(src_data, avx512_data, N); | |
} | |
auto end = chrono::steady_clock::now(); | |
cout << "avx512 time: " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 10 << " us" << endl; | |
cout << "avx512 result:" << endl; | |
for (int i = 0; i < N; i++) | |
{ | |
//cout << hex << src_data[i] <<" " << hex << avx512_data[i] <<endl; | |
} | |
// 预热 | |
for (int i = 0; i < 100; i++) | |
{ | |
avx2_little_2_big(src_data, avx2_data, N); | |
} | |
start = chrono::steady_clock::now(); | |
loop = 10000; | |
while (loop--) | |
{ | |
avx2_little_2_big(src_data, avx2_data, N); | |
} | |
end = chrono::steady_clock::now(); | |
cout << "avx2 time: " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 10 << " us" << endl; | |
// 预热 | |
for (int i = 0; i < 100; i++) | |
{ | |
bswap_little_2_big(src_data, avx2_data, N); | |
} | |
start = chrono::steady_clock::now(); | |
loop = 10000; | |
while (loop--) | |
{ | |
bswap_little_2_big(src_data, bswap_result, N); | |
} | |
end = chrono::steady_clock::now(); | |
cout << "bswap time: " << chrono::duration_cast<chrono::microseconds>(end - start).count() / 10 << " us" << endl; | |
cout << "bswap result:" << endl; | |
for (int i = 0; i < N; i++) | |
{ | |
//cout << hex << src_data[i] <<" " << hex << bswap_result[i] <<endl; | |
} | |
int pass = 1; | |
for (int i = 0; i < N; i++) | |
{ | |
if (bswap_result[i] != avx512_data[i] || bswap_result[i] != avx2_data[i]) | |
{ | |
cout << hex << bswap_result[i] << " " << hex << avx512_data[i] << hex << avx2_data[i]; | |
pass = 0; | |
break; | |
} | |
} | |
cout << "test result :" << pass << endl; | |
return 0; | |
} | |
namespace {

/// AVX-512 kernel behind avx512_little_2_big(); requires n > 0.
/// Carries its own target attribute so the intrinsics compile even when the
/// translation unit is not built with -mavx512f -mavx512bw.
__attribute__((target("avx512f,avx512bw")))
void avx512_bswap64_kernel(const long long int *src, long long int *dst, int n)
{
    constexpr int kLanes = 8; // 64-bit elements per 512-bit vector

    // vpshufb picks bytes relative to each 128-bit lane, so the control has to
    // alternate: indices 7..0 reverse the low qword of a lane, indices 15..8
    // reverse the high qword. Broadcasting only the low-qword pattern would
    // fill every high qword with the byte-swapped LOW qword.
    constexpr long long int kLowQword = 0x0001020304050607LL;
    constexpr long long int kHighQword = 0x08090a0b0c0d0e0fLL;
    const __m512i control = _mm512_set_epi64(kHighQword, kLowQword, kHighQword, kLowQword,
                                             kHighQword, kLowQword, kHighQword, kLowQword);

    const int vector_end = n - n % kLanes;
    for (int i = 0; i < vector_end; i += kLanes)
    {
        const __m512i in = _mm512_loadu_si512(src + i);
        _mm512_storeu_si512(dst + i, _mm512_shuffle_epi8(in, control));
    }

    const int tail = n - vector_end;
    if (tail != 0)
    {
        // One mask bit per 64-bit element; masked-off elements are neither
        // read nor written, so nothing past src[n-1] / dst[n-1] is touched.
        const __mmask8 live = static_cast<__mmask8>((1u << tail) - 1u);
        const __m512i in = _mm512_maskz_loadu_epi64(live, src + vector_end);
        _mm512_mask_storeu_epi64(dst + vector_end, live, _mm512_shuffle_epi8(in, control));
    }
}

} // namespace

/// Reverses the byte order of n 64-bit integers using AVX-512 (F + BW).
///
/// @param src  input array of at least n elements
/// @param dst  output array of at least n elements; receives the swapped values
/// @param n    number of elements; values <= 0 are a no-op
///
/// The caller is responsible for running this only on a CPU with AVX-512BW.
void avx512_little_2_big(const long long int *src, long long int *dst, int n)
{
    if (n <= 0)
    {
        return;
    }
    avx512_bswap64_kernel(src, dst, n);
}
namespace {

/// AVX2 kernel behind avx2_little_2_big(): byte-swaps `count` elements, where
/// `count` must be a multiple of 4. Carries its own target attribute so the
/// intrinsics compile even when the translation unit is not built with -mavx2.
__attribute__((target("avx2")))
void avx2_bswap64_vectors(const long long int *src, long long int *dst, int count)
{
    // vpshufb picks bytes relative to each 128-bit lane, so the control has to
    // alternate: indices 7..0 reverse the low qword of a lane, indices 15..8
    // reverse the high qword. Broadcasting only the low-qword pattern would
    // fill every high qword with the byte-swapped LOW qword.
    constexpr long long int kLowQword = 0x0001020304050607LL;
    constexpr long long int kHighQword = 0x08090a0b0c0d0e0fLL;
    const __m256i control = _mm256_set_epi64x(kHighQword, kLowQword, kHighQword, kLowQword);

    for (int i = 0; i < count; i += 4)
    {
        const __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), _mm256_shuffle_epi8(in, control));
    }
}

} // namespace

/// Reverses the byte order of n 64-bit integers using AVX2, four elements per
/// iteration, with a scalar bswap_64 loop for the last n % 4 elements.
///
/// @param src  input array of at least n elements
/// @param dst  output array of at least n elements; receives the swapped values
/// @param n    number of elements; values <= 0 are a no-op
///
/// The caller is responsible for running this only on a CPU with AVX2.
void avx2_little_2_big(const long long int *src, long long int *dst, int n)
{
    if (n <= 0)
    {
        return;
    }

    const int vector_end = n - n % 4;
    avx2_bswap64_vectors(src, dst, vector_end);

    for (int i = vector_end; i < n; i++)
    {
        dst[i] = bswap_64(src[i]);
    }
}
/// Scalar reference implementation: reverses the byte order of n 64-bit
/// integers one element at a time with bswap_64 from <byteswap.h>.
///
/// @param src  input array of at least n elements
/// @param dst  output array of at least n elements; receives the swapped values
/// @param n    number of elements; nothing is written when n <= 0
void bswap_little_2_big(const long long int *src, long long int *dst, int n)
{
    for (int i = 0; i < n; i++)
    {
        dst[i] = bswap_64(src[i]);
    }
}
// GitHub page footer: "Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment."