|
#include <stdint.h> |
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
#include <immintrin.h> |
|
#include <xmmintrin.h> |
|
#include <time.h> |
|
#include <string.h> |
|
|
|
/*
 * Return the time elapsed from *start to *end, in milliseconds.
 *
 * start, end: timestamps to compare; neither is modified. The result is
 *             negative when *end is earlier than *start.
 *
 * The two fields are subtracted individually before being scaled. That keeps
 * the intermediate values small (no integer "seconds * 1000" that could
 * overflow a narrow time_t) and avoids cancelling two large, nearly equal
 * doubles. Nanoseconds are divided by 1e6, which is exactly representable,
 * rather than multiplied by the inexact constant 0.000001.
 */
double milliDiff(const struct timespec *start, const struct timespec *end)
{
    const double sec_delta = (double)(end->tv_sec - start->tv_sec);
    const double nsec_delta = (double)(end->tv_nsec - start->tv_nsec);

    return sec_delta * 1000.0 + nsec_delta / 1.0e6;
}
|
|
|
#define PAGESIZE 4096 |
|
|
|
// A <- B ^ C; |
|
__attribute__((noinline)) |
|
void xor(uint8_t *A, uint8_t *B, uint8_t *C) { |
|
for (int i = 0; i < PAGESIZE; i += 128) { |
|
__m256i* p = (__m256i*)(A+i); |
|
__m256i* q = (__m256i*)(B+i); |
|
__m256i* r = (__m256i*)(C+i); |
|
|
|
__m256i v1 = _mm256_load_si256(q+0); |
|
__m256i v2 = _mm256_load_si256(q+1); |
|
__m256i v3 = _mm256_load_si256(q+2); |
|
__m256i v4 = _mm256_load_si256(q+3); |
|
|
|
__m256i w1 = _mm256_load_si256(r+0); |
|
__m256i w2 = _mm256_load_si256(r+1); |
|
__m256i w3 = _mm256_load_si256(r+2); |
|
__m256i w4 = _mm256_load_si256(r+3); |
|
|
|
__m256i x1 = _mm256_xor_si256(v1, w1); |
|
__m256i x2 = _mm256_xor_si256(v2, w2); |
|
__m256i x3 = _mm256_xor_si256(v3, w3); |
|
__m256i x4 = _mm256_xor_si256(v4, w4); |
|
|
|
_mm256_store_si256(p+0, x1); |
|
_mm256_store_si256(p+1, x2); |
|
_mm256_store_si256(p+2, x3); |
|
_mm256_store_si256(p+3, x4); |
|
} |
|
} |
|
|
|
/* |
|
__attribute__((noinline)) |
|
void to_cache(uint8_t *a) { |
|
for (int i = 0; i < PAGESIZE; i += 128) { |
|
__m256i *q = (__m256i*)(a + i); |
|
|
|
volatile __m256i v1 = _mm256_load_si256(q+0); |
|
volatile __m256i v2 = _mm256_load_si256(q+1); |
|
volatile __m256i v3 = _mm256_load_si256(q+3); |
|
volatile __m256i v4 = _mm256_load_si256(q+4); |
|
} |
|
} |
|
*/ |
|
|
|
|
|
/*
 * Read a 4096-byte buffer with aligned 32-byte AVX loads (vmovdqa), 128 bytes
 * per loop iteration, and discard the loaded values. Judging by the name, the
 * purpose is to pull the buffer into the CPU cache before a timed section.
 *
 * src: start of the buffer; must be 32-byte aligned (vmovdqa faults on
 *      misaligned addresses) and at least 4096 bytes long. The buffer is not
 *      modified.
 *
 * The original comment here said (translated): "noinline is attached because
 * things break when it is removed". The asm advances the pointer and
 * decrements the counter, so both operands are declared read-write ("+r");
 * declaring them input-only while modifying them is not allowed by the
 * compiler and is the likely cause of that breakage. noinline is retained so
 * the function stays a separate call in the benchmark loops.
 *
 * The "memory" clobber tells the compiler that the asm accesses memory it
 * cannot see through the operands, so surrounding memory accesses are not
 * moved across it. __asm__/__volatile__ are used instead of asm/volatile so
 * the code also builds in strict ISO modes (e.g. -std=c11).
 */
__attribute__((noinline))
void to_cache(uint8_t *src) {
    int32_t len = 4096; /* bytes left to read; a multiple of 128 */

    __asm__ __volatile__
    (
        "1:\n\t"
        "vmovdqa (%0), %%ymm0\n\t"
        "vmovdqa 32(%0), %%ymm1\n\t"
        "vmovdqa 64(%0), %%ymm2\n\t"
        "vmovdqa 96(%0), %%ymm3\n\t"
        "add $128, %0\n\t"
        "sub $128, %1\n\t"  /* sets ZF when the counter reaches zero */
        "jnz 1b"
        : "+r"(src), "+r"(len)
        :
        : "ymm0", "ymm1", "ymm2", "ymm3", "cc", "memory"
    );
}
|
|
|
#define NUM 120 |
|
|
|
// mem <- cache + mem |
|
__attribute__((noinline)) |
|
double program1(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) { |
|
struct timespec ts, te; |
|
double total = 0; |
|
for(int i = 0; i < NUM; ++i) { |
|
to_cache(A[i]); |
|
B[i][0] = 0; |
|
C[i][0] = 0; |
|
clock_gettime(CLOCK_MONOTONIC, &ts); |
|
xor(C[i], A[i], B[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &te); |
|
total += milliDiff(&ts, &te); |
|
} |
|
return total; |
|
} |
|
|
|
// cache <- cache + mem |
|
__attribute__((noinline)) |
|
double program2(uint8_t* A[NUM], uint8_t* B[NUM]) { |
|
struct timespec ts, te; |
|
double total = 0; |
|
for(int i = 0; i < NUM; ++i) { |
|
to_cache(A[i]); |
|
B[i][0] = 0; |
|
clock_gettime(CLOCK_MONOTONIC, &ts); |
|
xor(A[i], A[i], B[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &te); |
|
total += milliDiff(&ts, &te); |
|
} |
|
return total; |
|
} |
|
|
|
// mem <- cache + cache |
|
__attribute__((noinline)) |
|
double program3(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) { |
|
struct timespec ts, te; |
|
double total = 0; |
|
for(int i = 0; i < NUM; ++i) { |
|
to_cache(A[i]); |
|
to_cache(B[i]); |
|
C[i][0] = 0; |
|
clock_gettime(CLOCK_MONOTONIC, &ts); |
|
xor(C[i], A[i], B[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &te); |
|
total += milliDiff(&ts, &te); |
|
} |
|
return total; |
|
} |
|
|
|
// cache <- cache + cache |
|
__attribute__((noinline)) |
|
double program4(uint8_t* A[NUM], uint8_t* B[NUM]) { |
|
struct timespec ts, te; |
|
double total = 0; |
|
for(int i = 0; i < NUM; ++i) { |
|
to_cache(A[i]); |
|
to_cache(B[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &ts); |
|
xor(A[i], A[i], B[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &te); |
|
total += milliDiff(&ts, &te); |
|
} |
|
return total; |
|
} |
|
|
|
// mem <- cache + cache |
|
__attribute__((noinline)) |
|
double program5(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) { |
|
struct timespec ts, te; |
|
double total = 0; |
|
for(int i = 0; i < NUM; ++i) { |
|
to_cache(A[i]); |
|
to_cache(B[i]); |
|
to_cache(C[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &ts); |
|
xor(C[i], A[i], B[i]); |
|
clock_gettime(CLOCK_MONOTONIC, &te); |
|
total += milliDiff(&ts, &te); |
|
} |
|
return total; |
|
} |
|
|
|
int main() { |
|
uint8_t *A[NUM]; |
|
uint8_t *B[NUM]; |
|
uint8_t *C[NUM]; |
|
|
|
const int iter = 20000; |
|
double total_elapsed = 0.0; |
|
uint8_t info = 0; |
|
|
|
for(int i = 0; i < iter; ++i) { |
|
for(int j=0; j<NUM; ++j) { |
|
A[j] = malloc(4096); A[j][0] = 0; |
|
B[j] = malloc(4096); B[j][0] = 0; |
|
C[j] = malloc(4096); C[j][0] = 0; |
|
} |
|
|
|
/* |
|
2 と 3 で余り差が無い気がする |
|
*/ |
|
#ifdef PROGRAM1 |
|
if(!info) { puts("mem1 <- cache + mem2"); ++info; } |
|
total_elapsed += program1(A, B, C); |
|
#elif PROGRAM2 |
|
if(!info) { puts("cache1 <- cache1 + mem1"); ++info; } |
|
total_elapsed += program2(A, B); |
|
#elif PROGRAM3 |
|
if(!info) { puts("mem1 <- cache1 + cache2"); ++info; } |
|
total_elapsed += program3(A, B, C); |
|
#elif PROGRAM4 |
|
if(!info) { puts("cache1 <- cache1 + cache2"); ++info; } |
|
total_elapsed += program4(A, B); |
|
#elif PROGRAM5 |
|
if(!info) { puts("cache3 <- cache1 + cache2"); ++info; } |
|
total_elapsed += program5(A, B, C); |
|
#else |
|
#error hoge |
|
#endif |
|
|
|
for(int j=0; j<NUM; ++j) { |
|
free(A[j]); |
|
free(B[j]); |
|
free(C[j]); |
|
} |
|
} |
|
|
|
printf("elapsed = %lf ms\n", total_elapsed); |
|
|
|
return 0; |
|
} |