Skip to content

Instantly share code, notes, and snippets.

@yuezato
Last active December 15, 2020 19:36
Show Gist options
  • Save yuezato/752673466674b530fd1ca062343e8ef3 to your computer and use it in GitHub Desktop.
Save yuezato/752673466674b530fd1ca062343e8ef3 to your computer and use it in GitHub Desktop.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include <time.h>
#include <string.h>
/*
 * Elapsed time from *start to *end, in milliseconds.
 *
 * start, end: timestamps to compare; the result is negative when *end is
 *             earlier than *start.
 *
 * The two timestamps are subtracted field by field BEFORE being scaled.
 * Scaling tv_sec by 1000 first (in integer time_t arithmetic) can overflow
 * a 32-bit time_t, and subtracting two large doubles afterwards throws away
 * low-order bits of the sub-millisecond part.
 */
double milliDiff(struct timespec *start, struct timespec *end)
{
    const double whole_ms = (double)(end->tv_sec - start->tv_sec) * 1000.0;
    const double frac_ms  = (double)(end->tv_nsec - start->tv_nsec) * 0.000001;

    return whole_ms + frac_ms;
}
#define PAGESIZE 4096
/*
 * A <- B ^ C over one PAGESIZE-byte buffer.
 *
 * Each iteration handles 128 bytes (four 256-bit vectors), so PAGESIZE is
 * expected to be a multiple of 128.
 *
 * A may be the very same buffer as B or C: every iteration performs all of
 * its loads before any of its stores.
 *
 * The unaligned load/store intrinsics are used on purpose. The aligned
 * variants require addresses that are multiples of 32, which plain
 * malloc() does not promise.
 */
__attribute__((noinline))
void xor(uint8_t *A, uint8_t *B, uint8_t *C) {
    for (int i = 0; i < PAGESIZE; i += 128) {
        __m256i *dst = (__m256i *)(A + i);
        const __m256i *lhs = (const __m256i *)(B + i);
        const __m256i *rhs = (const __m256i *)(C + i);

        __m256i v1 = _mm256_loadu_si256(lhs + 0);
        __m256i v2 = _mm256_loadu_si256(lhs + 1);
        __m256i v3 = _mm256_loadu_si256(lhs + 2);
        __m256i v4 = _mm256_loadu_si256(lhs + 3);

        __m256i w1 = _mm256_loadu_si256(rhs + 0);
        __m256i w2 = _mm256_loadu_si256(rhs + 1);
        __m256i w3 = _mm256_loadu_si256(rhs + 2);
        __m256i w4 = _mm256_loadu_si256(rhs + 3);

        __m256i x1 = _mm256_xor_si256(v1, w1);
        __m256i x2 = _mm256_xor_si256(v2, w2);
        __m256i x3 = _mm256_xor_si256(v3, w3);
        __m256i x4 = _mm256_xor_si256(v4, w4);

        _mm256_storeu_si256(dst + 0, x1);
        _mm256_storeu_si256(dst + 1, x2);
        _mm256_storeu_si256(dst + 2, x3);
        _mm256_storeu_si256(dst + 3, x4);
    }
}
/*
 * A <- B ^ C ^ D over one PAGESIZE-byte buffer (a single fused pass
 * instead of two calls to the two-operand kernel).
 *
 * Each iteration handles 128 bytes (four 256-bit vectors), so PAGESIZE is
 * expected to be a multiple of 128.
 *
 * A may be the very same buffer as any source: every iteration performs
 * all of its loads before any of its stores.
 *
 * The unaligned load/store intrinsics are used on purpose. The aligned
 * variants require addresses that are multiples of 32, which plain
 * malloc() does not promise.
 */
__attribute__((noinline))
void xor3(uint8_t *A, uint8_t *B, uint8_t *C, uint8_t *D) {
    for (int i = 0; i < PAGESIZE; i += 128) {
        __m256i *dst = (__m256i *)(A + i);
        const __m256i *src_b = (const __m256i *)(B + i);
        const __m256i *src_c = (const __m256i *)(C + i);
        const __m256i *src_d = (const __m256i *)(D + i);

        __m256i v1 = _mm256_loadu_si256(src_b + 0);
        __m256i v2 = _mm256_loadu_si256(src_b + 1);
        __m256i v3 = _mm256_loadu_si256(src_b + 2);
        __m256i v4 = _mm256_loadu_si256(src_b + 3);

        __m256i w1 = _mm256_loadu_si256(src_c + 0);
        __m256i w2 = _mm256_loadu_si256(src_c + 1);
        __m256i w3 = _mm256_loadu_si256(src_c + 2);
        __m256i w4 = _mm256_loadu_si256(src_c + 3);

        __m256i x1 = _mm256_loadu_si256(src_d + 0);
        __m256i x2 = _mm256_loadu_si256(src_d + 1);
        __m256i x3 = _mm256_loadu_si256(src_d + 2);
        __m256i x4 = _mm256_loadu_si256(src_d + 3);

        /* B ^ C */
        __m256i a1 = _mm256_xor_si256(v1, w1);
        __m256i a2 = _mm256_xor_si256(v2, w2);
        __m256i a3 = _mm256_xor_si256(v3, w3);
        __m256i a4 = _mm256_xor_si256(v4, w4);

        /* (B ^ C) ^ D */
        __m256i b1 = _mm256_xor_si256(a1, x1);
        __m256i b2 = _mm256_xor_si256(a2, x2);
        __m256i b3 = _mm256_xor_si256(a3, x3);
        __m256i b4 = _mm256_xor_si256(a4, x4);

        _mm256_storeu_si256(dst + 0, b1);
        _mm256_storeu_si256(dst + 1, b2);
        _mm256_storeu_si256(dst + 2, b3);
        _mm256_storeu_si256(dst + 3, b4);
    }
}
/*
 * A <- B ^ C ^ D ^ E over one PAGESIZE-byte buffer (a single fused pass
 * instead of three calls to the two-operand kernel).
 *
 * Each iteration handles 128 bytes (four 256-bit vectors), so PAGESIZE is
 * expected to be a multiple of 128.
 *
 * A may be the very same buffer as any source: every iteration performs
 * all of its loads before any of its stores.
 *
 * The unaligned load/store intrinsics are used on purpose. The aligned
 * variants require addresses that are multiples of 32, which plain
 * malloc() does not promise.
 */
__attribute__((noinline))
void xor4(uint8_t *A, uint8_t *B, uint8_t *C, uint8_t *D, uint8_t *E) {
    for (int i = 0; i < PAGESIZE; i += 128) {
        __m256i *dst = (__m256i *)(A + i);
        const __m256i *src_b = (const __m256i *)(B + i);
        const __m256i *src_c = (const __m256i *)(C + i);
        const __m256i *src_d = (const __m256i *)(D + i);
        const __m256i *src_e = (const __m256i *)(E + i);

        __m256i v1 = _mm256_loadu_si256(src_b + 0);
        __m256i v2 = _mm256_loadu_si256(src_b + 1);
        __m256i v3 = _mm256_loadu_si256(src_b + 2);
        __m256i v4 = _mm256_loadu_si256(src_b + 3);

        __m256i w1 = _mm256_loadu_si256(src_c + 0);
        __m256i w2 = _mm256_loadu_si256(src_c + 1);
        __m256i w3 = _mm256_loadu_si256(src_c + 2);
        __m256i w4 = _mm256_loadu_si256(src_c + 3);

        __m256i x1 = _mm256_loadu_si256(src_d + 0);
        __m256i x2 = _mm256_loadu_si256(src_d + 1);
        __m256i x3 = _mm256_loadu_si256(src_d + 2);
        __m256i x4 = _mm256_loadu_si256(src_d + 3);

        __m256i y1 = _mm256_loadu_si256(src_e + 0);
        __m256i y2 = _mm256_loadu_si256(src_e + 1);
        __m256i y3 = _mm256_loadu_si256(src_e + 2);
        __m256i y4 = _mm256_loadu_si256(src_e + 3);

        /* B ^ C */
        __m256i a1 = _mm256_xor_si256(v1, w1);
        __m256i a2 = _mm256_xor_si256(v2, w2);
        __m256i a3 = _mm256_xor_si256(v3, w3);
        __m256i a4 = _mm256_xor_si256(v4, w4);

        /* (B ^ C) ^ D */
        __m256i b1 = _mm256_xor_si256(a1, x1);
        __m256i b2 = _mm256_xor_si256(a2, x2);
        __m256i b3 = _mm256_xor_si256(a3, x3);
        __m256i b4 = _mm256_xor_si256(a4, x4);

        /* ((B ^ C) ^ D) ^ E */
        __m256i c1 = _mm256_xor_si256(b1, y1);
        __m256i c2 = _mm256_xor_si256(b2, y2);
        __m256i c3 = _mm256_xor_si256(b3, y3);
        __m256i c4 = _mm256_xor_si256(b4, y4);

        _mm256_storeu_si256(dst + 0, c1);
        _mm256_storeu_si256(dst + 1, c2);
        _mm256_storeu_si256(dst + 2, c3);
        _mm256_storeu_si256(dst + 3, c4);
    }
}
#define NUM 120
/*
 * Unfused variant: for each of the NUM slots, computes
 * store[k] = A[k] ^ B[k] ^ C[k] ^ D[k] using three separate two-operand
 * xor() passes, with store[k] serving as the running accumulator.
 *
 * Returns the 8-bit wrapping sum of the first byte of every store buffer.
 */
uint8_t program1(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM], uint8_t *D[NUM], uint8_t *store[NUM]) {
    int slot = 0;
    while (slot < NUM) {
        uint8_t *acc = store[slot];

        xor(acc, A[slot], B[slot]);
        xor(acc, acc, C[slot]);
        xor(acc, acc, D[slot]);
        slot++;
    }

    uint8_t checksum = 0;
    for (slot = 0; slot < NUM; slot++) {
        checksum += store[slot][0];
    }
    return checksum;
}
/*
 * Partially fused variant: for each of the NUM slots, computes
 * store[k] = A[k] ^ B[k] ^ C[k] with one xor3() pass and then folds in
 * D[k] with a second, two-operand xor() pass.
 *
 * Returns the 8-bit wrapping sum of the first byte of every store buffer.
 */
uint8_t program2(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM], uint8_t *D[NUM], uint8_t *store[NUM]) {
    int slot = 0;
    while (slot < NUM) {
        uint8_t *acc = store[slot];

        xor3(acc, A[slot], B[slot], C[slot]);
        xor(acc, acc, D[slot]);
        slot++;
    }

    uint8_t checksum = 0;
    for (slot = 0; slot < NUM; slot++) {
        checksum += store[slot][0];
    }
    return checksum;
}
/*
 * Fully fused variant: for each of the NUM slots, computes
 * store[k] = A[k] ^ B[k] ^ C[k] ^ D[k] in a single xor4() pass.
 *
 * Returns the 8-bit wrapping sum of the first byte of every store buffer.
 */
uint8_t program3(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM], uint8_t *D[NUM], uint8_t *store[NUM]) {
    int slot = 0;
    while (slot < NUM) {
        xor4(store[slot], A[slot], B[slot], C[slot], D[slot]);
        slot++;
    }

    uint8_t checksum = 0;
    for (slot = 0; slot < NUM; slot++) {
        checksum += store[slot][0];
    }
    return checksum;
}
int main() {
struct timespec ts, te;
uint8_t *A[NUM];
uint8_t *B[NUM];
uint8_t *C[NUM];
uint8_t *D[NUM];
uint8_t *store[NUM];
const int iter = 1000;
double total_elapsed = 0.0;
uint8_t total_sum = 0;
for(int i = 0; i < iter; ++i) {
for(int j=0; j<NUM; ++j) {
A[j] = malloc(4096); A[j][0] = 0;
B[j] = malloc(4096); B[j][0] = 0;
C[j] = malloc(4096); C[j][0] = 0;
D[j] = malloc(4096); D[j][0] = 0;
store[j] = malloc(4096); store[j][0] = 0;
}
clock_gettime(CLOCK_MONOTONIC, &ts);
#ifdef PROGRAM1
total_sum += program1(A, B, C, D, store);
#elif PROGRAM2
total_sum += program2(A, B, C, D, store);
#elif PROGRAM3
total_sum += program3(A, B, C, D, store);
#else
#error hoge
#endif
clock_gettime(CLOCK_MONOTONIC, &te);
double elapsed = milliDiff(&ts, &te);
total_elapsed += elapsed;
for(int j=0; j<NUM; ++j) {
free(A[j]);
free(B[j]);
free(C[j]);
free(D[j]);
free(store[j]);
}
}
printf("elapsed = %lf\n", total_elapsed);
return 0;
}

どの実験結果もピーキーな気がするのはなぜ……

1

$ for i in {0..20}; do rm -f a.out && gcc -O3 -Wall -mavx2 xor_fusion.c -DPROGRAM1 && sleep 1 && ./a.out; done
elapsed = 53.369999
elapsed = 45.633999
elapsed = 52.520001
elapsed = 46.847999
elapsed = 55.667001
elapsed = 49.068001
elapsed = 60.450000
elapsed = 50.809000
elapsed = 60.221001
elapsed = 44.804999
elapsed = 54.203000
elapsed = 44.199998
elapsed = 46.526000
elapsed = 47.847000
elapsed = 45.778000
elapsed = 45.864999
elapsed = 52.623999
elapsed = 45.811998
elapsed = 55.029003
elapsed = 62.476999
elapsed = 42.785000

2

$ for i in {0..20}; do rm -f a.out && gcc -O3 -Wall -mavx2 xor_fusion.c -DPROGRAM2 && sleep 1 && ./a.out; done
elapsed = 42.905000
elapsed = 42.015001
elapsed = 44.977998
elapsed = 49.486999
elapsed = 47.443002
elapsed = 43.584002
elapsed = 43.436003
elapsed = 42.837999
elapsed = 48.566999
elapsed = 41.295001
elapsed = 48.536001
elapsed = 52.956000
elapsed = 48.220999
elapsed = 41.437998
elapsed = 51.865000
elapsed = 42.614000
elapsed = 42.662001
elapsed = 45.722000
elapsed = 44.511001
elapsed = 43.272001
elapsed = 51.190001

3

$ for i in {0..20}; do rm -f a.out && gcc -O3 -Wall -mavx2 xor_fusion.c -DPROGRAM3 && sleep 1 && ./a.out; done
elapsed = 39.329001
elapsed = 44.178001
elapsed = 48.467002
elapsed = 41.702002
elapsed = 38.660000
elapsed = 44.770001
elapsed = 43.252998
elapsed = 40.499998
elapsed = 48.806000
elapsed = 40.871998
elapsed = 51.690998
elapsed = 50.051998
elapsed = 54.566000
elapsed = 44.430001
elapsed = 40.595002
elapsed = 41.381999
elapsed = 39.693000
elapsed = 47.827999
elapsed = 40.702999
elapsed = 40.449003
elapsed = 40.563999
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment