Skip to content

Instantly share code, notes, and snippets.

@yuezato
Created December 20, 2020 18:07
Show Gist options
  • Save yuezato/7bd716d63e1b5df840cd16ede075f5fb to your computer and use it in GitHub Desktop.
Save yuezato/7bd716d63e1b5df840cd16ede075f5fb to your computer and use it in GitHub Desktop.

PROGRAM2 cache1 <- cache1 + mem1

gcc -O3 -Wall -mavx2 read_write.c -DPROGRAM2
/Users/yuuya_uezato/cache$ for i in {0..10}; do ./a.out ; done
cache1 <- cache1 + mem1
elapsed = 353.935910 ms
cache1 <- cache1 + mem1
elapsed = 359.755992 ms
cache1 <- cache1 + mem1
elapsed = 363.224911 ms
cache1 <- cache1 + mem1
elapsed = 395.478990 ms
cache1 <- cache1 + mem1
elapsed = 359.182050 ms
cache1 <- cache1 + mem1
elapsed = 353.628968 ms
cache1 <- cache1 + mem1
elapsed = 354.928056 ms
cache1 <- cache1 + mem1
elapsed = 355.549968 ms
cache1 <- cache1 + mem1
elapsed = 358.411956 ms
cache1 <- cache1 + mem1
elapsed = 360.245996 ms
cache1 <- cache1 + mem1
elapsed = 358.720945 ms

PROGRAM3 mem1 <- cache1 + cache2

$ gcc -O3 -Wall -mavx2 read_write.c -DPROGRAM3
$ for i in {0..10}; do ./a.out ; done
mem1 <- cache1 + cache2
elapsed = 370.072017 ms
mem1 <- cache1 + cache2
elapsed = 363.054001 ms
mem1 <- cache1 + cache2
elapsed = 372.891021 ms
mem1 <- cache1 + cache2
elapsed = 365.868011 ms
mem1 <- cache1 + cache2
elapsed = 364.621985 ms
mem1 <- cache1 + cache2
elapsed = 364.180984 ms
mem1 <- cache1 + cache2
elapsed = 366.565988 ms
mem1 <- cache1 + cache2
elapsed = 357.636989 ms
mem1 <- cache1 + cache2
elapsed = 360.042065 ms
mem1 <- cache1 + cache2
elapsed = 369.603046 ms
mem1 <- cache1 + cache2
elapsed = 363.381019 ms
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include <time.h>
#include <string.h>
/**
 * Return the time elapsed from *start to *end, in milliseconds.
 *
 * The seconds and nanoseconds fields are subtracted first and only the
 * resulting differences are scaled. Scaling the absolute tv_sec values
 * (integer multiply by 1000) can overflow on platforms where time_t is
 * 32 bits wide and needlessly spends floating-point precision on the
 * large absolute timestamps.
 *
 * @param start  earlier timestamp (not modified)
 * @param end    later timestamp (not modified)
 * @return       (end - start) in milliseconds; negative if end precedes start
 */
double milliDiff(const struct timespec *start, const struct timespec *end)
{
    const double sec_delta  = (double)(end->tv_sec - start->tv_sec);
    const double nsec_delta = (double)(end->tv_nsec - start->tv_nsec);

    /* 1 s = 1e3 ms, 1 ns = 1e-6 ms */
    return sec_delta * 1e3 + nsec_delta / 1e6;
}
#define PAGESIZE 4096
/*
 * A <- B ^ C over one PAGESIZE-byte buffer.
 *
 * Each iteration handles 128 bytes as four 256-bit vectors. The aligned
 * load/store intrinsics are used, so A, B and C must each be 32-byte aligned.
 * A may be the same buffer as B (the benchmark calls it that way).
 * The function is deliberately kept out of line (noinline).
 */
__attribute__((noinline))
void xor(uint8_t *A, uint8_t *B, uint8_t *C) {
    for (int off = 0; off < PAGESIZE; off += 128) {
        __m256i *dst = (__m256i *)(A + off);
        __m256i *lhs = (__m256i *)(B + off);
        __m256i *rhs = (__m256i *)(C + off);

        /* All eight loads are issued before the first store. */
        __m256i b0 = _mm256_load_si256(lhs + 0);
        __m256i b1 = _mm256_load_si256(lhs + 1);
        __m256i b2 = _mm256_load_si256(lhs + 2);
        __m256i b3 = _mm256_load_si256(lhs + 3);
        __m256i c0 = _mm256_load_si256(rhs + 0);
        __m256i c1 = _mm256_load_si256(rhs + 1);
        __m256i c2 = _mm256_load_si256(rhs + 2);
        __m256i c3 = _mm256_load_si256(rhs + 3);

        _mm256_store_si256(dst + 0, _mm256_xor_si256(b0, c0));
        _mm256_store_si256(dst + 1, _mm256_xor_si256(b1, c1));
        _mm256_store_si256(dst + 2, _mm256_xor_si256(b2, c2));
        _mm256_store_si256(dst + 3, _mm256_xor_si256(b3, c3));
    }
}
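/*
 * Disabled earlier version of to_cache() written with intrinsics; the
 * inline-assembly to_cache() below is the one that is compiled.
 * The four loads cover vectors 0..3 of each 128-byte stride.
__attribute__((noinline))
void to_cache(uint8_t *a) {
for (int i = 0; i < PAGESIZE; i += 128) {
__m256i *q = (__m256i*)(a + i);
volatile __m256i v1 = _mm256_load_si256(q+0);
volatile __m256i v2 = _mm256_load_si256(q+1);
volatile __m256i v3 = _mm256_load_si256(q+2);
volatile __m256i v4 = _mm256_load_si256(q+3);
}
}
*/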
/*
 * Read one 4096-byte buffer with aligned 256-bit loads, 128 bytes per loop
 * iteration, discarding the loaded values. The benchmark calls this on a page
 * it wants to have been read before the timed section.
 *
 * src must be 32-byte aligned, because vmovdqa is the aligned-load form.
 *
 * The assembly advances the pointer register and counts the length register
 * down, so both are declared as read-write ("+r") operands. Declaring them
 * input-only while modifying them is not permitted by GCC's extended-asm
 * rules and is the likely reason the function misbehaved once inlined.
 * noinline is kept so the benchmark's call structure stays the same.
 *
 * The "memory" clobber tells the compiler that the asm reads memory that is
 * not listed as an operand.
 */
__attribute__((noinline))
void to_cache(uint8_t *src) {
    int32_t len = 4096; /* bytes to read; a multiple of the 128-byte stride */
    __asm__ __volatile__
    (
        "1:\n\t"
        "vmovdqa (%0), %%ymm0\n\t"
        "vmovdqa 32(%0), %%ymm1\n\t"
        "vmovdqa 64(%0), %%ymm2\n\t"
        "vmovdqa 96(%0), %%ymm3\n\t"
        "add $128, %0\n\t"
        "sub $128, %1\n\t"
        "jnz 1b"
        : "+r"(src), "+r"(len)
        :
        : "ymm0", "ymm1", "ymm2", "ymm3", "cc", "memory"
    );
}
#define NUM 120
// mem <- cache + mem
//
// For each of the NUM page triples: A[i] is read in full through to_cache(),
// only byte 0 of B[i] and C[i] is written, and then xor(C[i], A[i], B[i]) is
// timed. Only the xor() call sits between the two clock_gettime() samples.
// NOTE(review): B and C are presumably meant to be mostly uncached at that
// point; this is not verified by the code.
// Returns the accumulated xor() time in milliseconds.
__attribute__((noinline))
double program1(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
    double sum_ms = 0;
    int idx = 0;
    while (idx < NUM) {
        struct timespec t_begin, t_end;

        to_cache(A[idx]);
        B[idx][0] = 0;
        C[idx][0] = 0;

        clock_gettime(CLOCK_MONOTONIC, &t_begin);
        xor(C[idx], A[idx], B[idx]);
        clock_gettime(CLOCK_MONOTONIC, &t_end);

        sum_ms += milliDiff(&t_begin, &t_end);
        ++idx;
    }
    return sum_ms;
}
// cache <- cache + mem
//
// In-place variant: A[i] is read in full through to_cache(), only byte 0 of
// B[i] is written, and then xor(A[i], A[i], B[i]) is timed, so the
// destination is the same page as the first source.
// Returns the accumulated xor() time in milliseconds over all NUM pairs.
__attribute__((noinline))
double program2(uint8_t* A[NUM], uint8_t* B[NUM]) {
    double elapsed_ms = 0;
    for (int k = 0; k < NUM; ++k) {
        struct timespec before, after;
        uint8_t *dst_and_lhs = A[k];
        uint8_t *rhs = B[k];

        to_cache(dst_and_lhs);
        rhs[0] = 0;

        clock_gettime(CLOCK_MONOTONIC, &before);
        xor(dst_and_lhs, dst_and_lhs, rhs);
        clock_gettime(CLOCK_MONOTONIC, &after);

        elapsed_ms = elapsed_ms + milliDiff(&before, &after);
    }
    return elapsed_ms;
}
// mem <- cache + cache
//
// Both sources A[i] and B[i] are read in full through to_cache(); the
// destination C[i] only has byte 0 written. xor(C[i], A[i], B[i]) is then
// timed with CLOCK_MONOTONIC.
// Returns the accumulated xor() time in milliseconds over all NUM triples.
__attribute__((noinline))
double program3(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
    struct timespec t0, t1;
    double acc = 0;
    int n = 0;

    for (; n != NUM; n++) {
        /* warm both inputs, touch the output */
        to_cache(A[n]);
        to_cache(B[n]);
        C[n][0] = 0;

        /* timed region: the xor() call only */
        clock_gettime(CLOCK_MONOTONIC, &t0);
        xor(C[n], A[n], B[n]);
        clock_gettime(CLOCK_MONOTONIC, &t1);

        acc += milliDiff(&t0, &t1);
    }
    return acc;
}
// cache <- cache + cache
//
// In-place variant with both operands pre-read: A[i] and B[i] each go
// through to_cache(), then xor(A[i], A[i], B[i]) is timed.
// Returns the accumulated xor() time in milliseconds over all NUM pairs.
__attribute__((noinline))
double program4(uint8_t* A[NUM], uint8_t* B[NUM]) {
    double total_ms = 0;
    int pos = 0;
    while (pos < NUM) {
        struct timespec started, stopped;
        uint8_t *acc_page = A[pos];
        uint8_t *src_page = B[pos];

        to_cache(acc_page);
        to_cache(src_page);

        clock_gettime(CLOCK_MONOTONIC, &started);
        xor(acc_page, acc_page, src_page);
        clock_gettime(CLOCK_MONOTONIC, &stopped);

        total_ms += milliDiff(&started, &stopped);
        pos++;
    }
    return total_ms;
}
// cache <- cache + cache, with a separate destination page
//
// All three pages A[i], B[i] and C[i] are read in full through to_cache()
// before xor(C[i], A[i], B[i]) is timed.
// Returns the accumulated xor() time in milliseconds over all NUM triples.
__attribute__((noinline))
double program5(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
    double sum = 0;
    for (int slot = 0; slot < NUM; slot++) {
        struct timespec tic, toc;

        /* pre-read both sources and the destination */
        to_cache(A[slot]);
        to_cache(B[slot]);
        to_cache(C[slot]);

        clock_gettime(CLOCK_MONOTONIC, &tic);
        xor(C[slot], A[slot], B[slot]);
        clock_gettime(CLOCK_MONOTONIC, &toc);

        sum = sum + milliDiff(&tic, &toc);
    }
    return sum;
}
int main() {
uint8_t *A[NUM];
uint8_t *B[NUM];
uint8_t *C[NUM];
const int iter = 20000;
double total_elapsed = 0.0;
uint8_t info = 0;
for(int i = 0; i < iter; ++i) {
for(int j=0; j<NUM; ++j) {
A[j] = malloc(4096); A[j][0] = 0;
B[j] = malloc(4096); B[j][0] = 0;
C[j] = malloc(4096); C[j][0] = 0;
}
/*
2 と 3 で余り差が無い気がする
*/
#ifdef PROGRAM1
if(!info) { puts("mem1 <- cache + mem2"); ++info; }
total_elapsed += program1(A, B, C);
#elif PROGRAM2
if(!info) { puts("cache1 <- cache1 + mem1"); ++info; }
total_elapsed += program2(A, B);
#elif PROGRAM3
if(!info) { puts("mem1 <- cache1 + cache2"); ++info; }
total_elapsed += program3(A, B, C);
#elif PROGRAM4
if(!info) { puts("cache1 <- cache1 + cache2"); ++info; }
total_elapsed += program4(A, B);
#elif PROGRAM5
if(!info) { puts("cache3 <- cache1 + cache2"); ++info; }
total_elapsed += program5(A, B, C);
#else
#error hoge
#endif
for(int j=0; j<NUM; ++j) {
free(A[j]);
free(B[j]);
free(C[j]);
}
}
printf("elapsed = %lf ms\n", total_elapsed);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment