yuezato/hoge.md

## hoge.md

      
    Raw
  

              hoge.md
            
          
### PROGRAM2: cache1 <- cache1 + mem1

gcc -O3 -Wall -mavx2 read_write.c -DPROGRAM2
/Users/yuuya_uezato/cache$ for i in {0..10}; do ./a.out ; done
cache1 <- cache1 + mem1
elapsed = 353.935910 ms
cache1 <- cache1 + mem1
elapsed = 359.755992 ms
cache1 <- cache1 + mem1
elapsed = 363.224911 ms
cache1 <- cache1 + mem1
elapsed = 395.478990 ms
cache1 <- cache1 + mem1
elapsed = 359.182050 ms
cache1 <- cache1 + mem1
elapsed = 353.628968 ms
cache1 <- cache1 + mem1
elapsed = 354.928056 ms
cache1 <- cache1 + mem1
elapsed = 355.549968 ms
cache1 <- cache1 + mem1
elapsed = 358.411956 ms
cache1 <- cache1 + mem1
elapsed = 360.245996 ms
cache1 <- cache1 + mem1
elapsed = 358.720945 ms

### PROGRAM3: mem1 <- cache1 + cache2

$ gcc -O3 -Wall -mavx2 read_write.c -DPROGRAM3
$ for i in {0..10}; do ./a.out ; done
mem1 <- cache1 + cache2
elapsed = 370.072017 ms
mem1 <- cache1 + cache2
elapsed = 363.054001 ms
mem1 <- cache1 + cache2
elapsed = 372.891021 ms
mem1 <- cache1 + cache2
elapsed = 365.868011 ms
mem1 <- cache1 + cache2
elapsed = 364.621985 ms
mem1 <- cache1 + cache2
elapsed = 364.180984 ms
mem1 <- cache1 + cache2
elapsed = 366.565988 ms
mem1 <- cache1 + cache2
elapsed = 357.636989 ms
mem1 <- cache1 + cache2
elapsed = 360.042065 ms
mem1 <- cache1 + cache2
elapsed = 369.603046 ms
mem1 <- cache1 + cache2
elapsed = 363.381019 ms

  
## read_write.c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include <time.h>
#include <string.h>

/*
 * Return the time elapsed from *start to *end, in milliseconds.
 *
 * The seconds and nanoseconds fields are subtracted separately and only then
 * scaled. Scaling the absolute timestamps first (as was done before) loses
 * the low-order bits of short intervals once tv_sec is large, because the
 * intermediate values no longer fit the 53-bit mantissa of a double.
 *
 * If *end is earlier than *start the result is negative.
 */
double milliDiff(const struct timespec *start, const struct timespec *end)
{
  const double sec_diff  = (double)(end->tv_sec - start->tv_sec);
  const double nsec_diff = (double)(end->tv_nsec - start->tv_nsec);

  return sec_diff * 1000.0 + nsec_diff / 1000000.0;
}

#define PAGESIZE 4096

// A <- B ^ C over one PAGESIZE-byte page.
// The page is processed in 128-byte chunks of four 256-bit lanes. Every load
// of a chunk is issued before any store of that chunk, so A may be the very
// same buffer as B (the in-place case used by the benchmark).
// The aligned load/store intrinsics require all three pointers to be 32-byte
// aligned.
__attribute__((noinline))
void xor(uint8_t *A, uint8_t *B, uint8_t *C) {
  enum { LANES = 4 };

  for (int off = 0; off < PAGESIZE; off += 128) {
    __m256i *dst = (__m256i *)(A + off);
    __m256i *lhs = (__m256i *)(B + off);
    __m256i *rhs = (__m256i *)(C + off);
    __m256i left[LANES];
    __m256i right[LANES];

    for (int k = 0; k < LANES; ++k) {
      left[k] = _mm256_load_si256(lhs + k);
    }
    for (int k = 0; k < LANES; ++k) {
      right[k] = _mm256_load_si256(rhs + k);
    }
    for (int k = 0; k < LANES; ++k) {
      left[k] = _mm256_xor_si256(left[k], right[k]);
    }
    for (int k = 0; k < LANES; ++k) {
      _mm256_store_si256(dst + k, left[k]);
    }
  }
}

/*
__attribute__((noinline))
void to_cache(uint8_t *a) {
  for (int i = 0; i < PAGESIZE; i += 128) {
    __m256i *q = (__m256i*)(a + i);

    volatile __m256i v1 = _mm256_load_si256(q+0);
    volatile __m256i v2 = _mm256_load_si256(q+1);
    volatile __m256i v3 = _mm256_load_si256(q+2);
    volatile __m256i v4 = _mm256_load_si256(q+3);
  }
}
*/


// Read one 4096-byte page with aligned 256-bit loads and discard the values;
// the benchmark uses this to get the page into the cache before timing.
// src must be 32-byte aligned (vmovdqa faults on unaligned addresses).
//
// The asm advances the pointer and counts the remaining length down, so both
// are declared as read-write ("+r") operands on local copies. GCC forbids
// modifying input-only operands; that is most likely why the original note
// here said the code broke when noinline was removed. The "memory" clobber
// tells the compiler the asm reads memory that is not listed as an operand.
// noinline is kept so the benchmark still performs a real call.
__attribute__((noinline))
void to_cache(uint8_t *src) {
  uint8_t *cursor = src;
  int32_t remaining = 4096;
  __asm__ __volatile__
    (
     "LOOP%=:\n\t"
     "vmovdqa   (%0),  %%ymm0;\n\t"
     "vmovdqa 32(%0),  %%ymm1;\n\t"
     "vmovdqa 64(%0),  %%ymm2;\n\t"
     "vmovdqa 96(%0),  %%ymm3;\n\t"
     "add $128, %0;\n\t"
     "sub $128, %1;\n\t"
     "jnz LOOP%=;"
     : "+r"(cursor), "+r"(remaining)
     :
     : "ymm0", "ymm1", "ymm2", "ymm3", "cc", "memory"
     );
}

#define NUM 120

// mem <- cache + mem
// For every index below NUM: read A[i] through to_cache, write only the first
// byte of B[i] and C[i], then time xor(C[i], A[i], B[i]).
// Only the xor call sits between the two clock readings.
// Returns the sum of the measured times in milliseconds.
__attribute__((noinline))
double program1(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
  double elapsed_ms = 0;
  int page = 0;

  while (page < NUM) {
    struct timespec begin, finish;

    to_cache(A[page]);
    B[page][0] = 0;
    C[page][0] = 0;

    clock_gettime(CLOCK_MONOTONIC, &begin);
    xor(C[page], A[page], B[page]);
    clock_gettime(CLOCK_MONOTONIC, &finish);

    elapsed_ms += milliDiff(&begin, &finish);
    ++page;
  }
  return elapsed_ms;
}

// cache <- cache + mem
// For every index below NUM: read A[i] through to_cache, write only the first
// byte of B[i], then time the in-place xor(A[i], A[i], B[i]).
// Returns the sum of the measured times in milliseconds.
__attribute__((noinline))
double program2(uint8_t* A[NUM], uint8_t* B[NUM]) {
  double elapsed_ms = 0;
  int page = 0;

  while (page < NUM) {
    struct timespec begin, finish;

    to_cache(A[page]);
    B[page][0] = 0;

    clock_gettime(CLOCK_MONOTONIC, &begin);
    xor(A[page], A[page], B[page]);
    clock_gettime(CLOCK_MONOTONIC, &finish);

    elapsed_ms += milliDiff(&begin, &finish);
    ++page;
  }
  return elapsed_ms;
}

// mem <- cache + cache
// For every index below NUM: read A[i] and B[i] through to_cache, write only
// the first byte of C[i], then time xor(C[i], A[i], B[i]).
// Returns the sum of the measured times in milliseconds.
__attribute__((noinline))
double program3(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
  double elapsed_ms = 0;
  int page = 0;

  while (page < NUM) {
    struct timespec begin, finish;

    to_cache(A[page]);
    to_cache(B[page]);
    C[page][0] = 0;

    clock_gettime(CLOCK_MONOTONIC, &begin);
    xor(C[page], A[page], B[page]);
    clock_gettime(CLOCK_MONOTONIC, &finish);

    elapsed_ms += milliDiff(&begin, &finish);
    ++page;
  }
  return elapsed_ms;
}

// cache <- cache + cache
// For every index below NUM: read A[i] and B[i] through to_cache, then time
// the in-place xor(A[i], A[i], B[i]).
// Returns the sum of the measured times in milliseconds.
__attribute__((noinline))
double program4(uint8_t* A[NUM], uint8_t* B[NUM]) {
  double elapsed_ms = 0;
  int page = 0;

  while (page < NUM) {
    struct timespec begin, finish;

    to_cache(A[page]);
    to_cache(B[page]);

    clock_gettime(CLOCK_MONOTONIC, &begin);
    xor(A[page], A[page], B[page]);
    clock_gettime(CLOCK_MONOTONIC, &finish);

    elapsed_ms += milliDiff(&begin, &finish);
    ++page;
  }
  return elapsed_ms;
}

// cache3 <- cache1 + cache2
// For every index below NUM: read A[i], B[i] and the destination C[i] through
// to_cache, then time xor(C[i], A[i], B[i]).
// Returns the sum of the measured times in milliseconds.
__attribute__((noinline))
double program5(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
  double elapsed_ms = 0;
  int page = 0;

  while (page < NUM) {
    struct timespec begin, finish;

    to_cache(A[page]);
    to_cache(B[page]);
    to_cache(C[page]);

    clock_gettime(CLOCK_MONOTONIC, &begin);
    xor(C[page], A[page], B[page]);
    clock_gettime(CLOCK_MONOTONIC, &finish);

    elapsed_ms += milliDiff(&begin, &finish);
    ++page;
  }
  return elapsed_ms;
}

int main() {
  uint8_t *A[NUM];
  uint8_t *B[NUM];
  uint8_t *C[NUM];

  const int iter = 20000;
  double total_elapsed = 0.0;
  uint8_t info = 0;

  for(int i = 0; i < iter; ++i) {
    for(int j=0; j<NUM; ++j) {
      A[j] = malloc(4096); A[j][0] = 0;
      B[j] = malloc(4096); B[j][0] = 0;
      C[j] = malloc(4096); C[j][0] = 0;
    }

    /*
      2 と 3 で余り差が無い気がする
     */
#ifdef PROGRAM1
    if(!info) { puts("mem1 <- cache + mem2"); ++info; }
    total_elapsed += program1(A, B, C);
#elif  PROGRAM2
    if(!info) { puts("cache1 <- cache1 + mem1"); ++info; }
    total_elapsed += program2(A, B);
#elif  PROGRAM3
    if(!info) { puts("mem1 <- cache1 + cache2"); ++info; }
    total_elapsed += program3(A, B, C);
#elif  PROGRAM4
    if(!info) { puts("cache1 <- cache1 + cache2"); ++info; }
    total_elapsed += program4(A, B);
#elif  PROGRAM5
    if(!info) { puts("cache3 <- cache1 + cache2"); ++info; }
    total_elapsed += program5(A, B, C);
#else
    #error hoge
#endif

    for(int j=0; j<NUM; ++j) {
      free(A[j]);
      free(B[j]);
      free(C[j]);
    }
  }

  printf("elapsed = %lf ms\n", total_elapsed);

  return 0;
}
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <immintrin.h>
	#include <xmmintrin.h>
	#include <time.h>
	#include <string.h>

	/*
	 * Return the time elapsed from *start to *end, in milliseconds.
	 *
	 * The parameters are pointers: the body dereferences them with ->, and
	 * the callers pass &ts / &te. The fields are subtracted before scaling
	 * so that short intervals are not rounded away when tv_sec is large.
	 * If *end is earlier than *start the result is negative.
	 */
	double milliDiff(const struct timespec *start, const struct timespec *end)
	{
	  const double sec_diff  = (double)(end->tv_sec - start->tv_sec);
	  const double nsec_diff = (double)(end->tv_nsec - start->tv_nsec);

	  return sec_diff * 1000.0 + nsec_diff / 1000000.0;
	}

	#define PAGESIZE 4096

	// A <- B ^ C over one PAGESIZE-byte page.
	// A and B are pointers like C: the body offsets and casts them to
	// __m256i*, and the callers pass page pointers. The page is processed in
	// 128-byte chunks; every load of a chunk is issued before any store of
	// that chunk, so A may be the very same buffer as B.
	// The aligned load/store intrinsics require 32-byte aligned pointers.
	__attribute__((noinline))
	void xor(uint8_t *A, uint8_t *B, uint8_t *C) {
	  for (int i = 0; i < PAGESIZE; i += 128) {
	    __m256i* p = (__m256i*)(A+i);
	    __m256i* q = (__m256i*)(B+i);
	    __m256i* r = (__m256i*)(C+i);

	    __m256i v1 = _mm256_load_si256(q+0);
	    __m256i v2 = _mm256_load_si256(q+1);
	    __m256i v3 = _mm256_load_si256(q+2);
	    __m256i v4 = _mm256_load_si256(q+3);

	    __m256i w1 = _mm256_load_si256(r+0);
	    __m256i w2 = _mm256_load_si256(r+1);
	    __m256i w3 = _mm256_load_si256(r+2);
	    __m256i w4 = _mm256_load_si256(r+3);

	    __m256i x1 = _mm256_xor_si256(v1, w1);
	    __m256i x2 = _mm256_xor_si256(v2, w2);
	    __m256i x3 = _mm256_xor_si256(v3, w3);
	    __m256i x4 = _mm256_xor_si256(v4, w4);

	    _mm256_store_si256(p+0, x1);
	    _mm256_store_si256(p+1, x2);
	    _mm256_store_si256(p+2, x3);
	    _mm256_store_si256(p+3, x4);
	  }
	}

	/*
	__attribute__((noinline))
	void to_cache(uint8_t *a) {
	for (int i = 0; i < PAGESIZE; i += 128) {
	__m256i *q = (__m256i*)(a + i);

	volatile __m256i v1 = _mm256_load_si256(q+0);
	volatile __m256i v2 = _mm256_load_si256(q+1);
	volatile __m256i v3 = _mm256_load_si256(q+2);
	volatile __m256i v4 = _mm256_load_si256(q+3);
	}
	}
	*/


	// Read one 4096-byte page with aligned 256-bit loads and discard the
	// values; the benchmark uses this to get the page into the cache before
	// timing. src must be 32-byte aligned (vmovdqa faults otherwise).
	//
	// The asm advances the pointer and counts the remaining length down, so
	// both are read-write ("+r") operands on local copies. GCC forbids
	// modifying input-only operands; that is most likely why the original
	// note here said the code broke when noinline was removed. The "memory"
	// clobber covers the memory the asm reads without listing it as an
	// operand. noinline is kept so the benchmark still performs a real call.
	__attribute__((noinline))
	void to_cache(uint8_t *src) {
	  uint8_t *cursor = src;
	  int32_t remaining = 4096;
	  __asm__ __volatile__
	  (
	   "LOOP%=:\n\t"
	   "vmovdqa (%0), %%ymm0;\n\t"
	   "vmovdqa 32(%0), %%ymm1;\n\t"
	   "vmovdqa 64(%0), %%ymm2;\n\t"
	   "vmovdqa 96(%0), %%ymm3;\n\t"
	   "add $128, %0;\n\t"
	   "sub $128, %1;\n\t"
	   "jnz LOOP%=;"
	   : "+r"(cursor), "+r"(remaining)
	   :
	   : "ymm0", "ymm1", "ymm2", "ymm3", "cc", "memory"
	  );
	}

	#define NUM 120

	// mem <- cache + mem
	// For every index below NUM: read A[i] through to_cache, write only the
	// first byte of B[i] and C[i], then time xor(C[i], A[i], B[i]).
	// Returns the sum of the measured times in milliseconds.
	__attribute__((noinline))
	double program1(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
	  double elapsed_ms = 0;
	  int page = 0;

	  while (page < NUM) {
	    struct timespec begin, finish;

	    to_cache(A[page]);
	    B[page][0] = 0;
	    C[page][0] = 0;

	    clock_gettime(CLOCK_MONOTONIC, &begin);
	    xor(C[page], A[page], B[page]);
	    clock_gettime(CLOCK_MONOTONIC, &finish);

	    elapsed_ms += milliDiff(&begin, &finish);
	    ++page;
	  }
	  return elapsed_ms;
	}

	// cache <- cache + mem
	// For every index below NUM: read A[i] through to_cache, write only the
	// first byte of B[i], then time the in-place xor(A[i], A[i], B[i]).
	// Returns the sum of the measured times in milliseconds.
	__attribute__((noinline))
	double program2(uint8_t* A[NUM], uint8_t* B[NUM]) {
	  double elapsed_ms = 0;
	  int page = 0;

	  while (page < NUM) {
	    struct timespec begin, finish;

	    to_cache(A[page]);
	    B[page][0] = 0;

	    clock_gettime(CLOCK_MONOTONIC, &begin);
	    xor(A[page], A[page], B[page]);
	    clock_gettime(CLOCK_MONOTONIC, &finish);

	    elapsed_ms += milliDiff(&begin, &finish);
	    ++page;
	  }
	  return elapsed_ms;
	}

	// mem <- cache + cache
	// For every index below NUM: read A[i] and B[i] through to_cache, write
	// only the first byte of C[i], then time xor(C[i], A[i], B[i]).
	// Returns the sum of the measured times in milliseconds.
	__attribute__((noinline))
	double program3(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
	  double elapsed_ms = 0;
	  int page = 0;

	  while (page < NUM) {
	    struct timespec begin, finish;

	    to_cache(A[page]);
	    to_cache(B[page]);
	    C[page][0] = 0;

	    clock_gettime(CLOCK_MONOTONIC, &begin);
	    xor(C[page], A[page], B[page]);
	    clock_gettime(CLOCK_MONOTONIC, &finish);

	    elapsed_ms += milliDiff(&begin, &finish);
	    ++page;
	  }
	  return elapsed_ms;
	}

	// cache <- cache + cache
	// For every index below NUM: read A[i] and B[i] through to_cache, then
	// time the in-place xor(A[i], A[i], B[i]).
	// Returns the sum of the measured times in milliseconds.
	__attribute__((noinline))
	double program4(uint8_t* A[NUM], uint8_t* B[NUM]) {
	  double elapsed_ms = 0;
	  int page = 0;

	  while (page < NUM) {
	    struct timespec begin, finish;

	    to_cache(A[page]);
	    to_cache(B[page]);

	    clock_gettime(CLOCK_MONOTONIC, &begin);
	    xor(A[page], A[page], B[page]);
	    clock_gettime(CLOCK_MONOTONIC, &finish);

	    elapsed_ms += milliDiff(&begin, &finish);
	    ++page;
	  }
	  return elapsed_ms;
	}

	// cache3 <- cache1 + cache2
	// For every index below NUM: read A[i], B[i] and the destination C[i]
	// through to_cache, then time xor(C[i], A[i], B[i]).
	// Returns the sum of the measured times in milliseconds.
	__attribute__((noinline))
	double program5(uint8_t* A[NUM], uint8_t* B[NUM], uint8_t* C[NUM]) {
	  double elapsed_ms = 0;
	  int page = 0;

	  while (page < NUM) {
	    struct timespec begin, finish;

	    to_cache(A[page]);
	    to_cache(B[page]);
	    to_cache(C[page]);

	    clock_gettime(CLOCK_MONOTONIC, &begin);
	    xor(C[page], A[page], B[page]);
	    clock_gettime(CLOCK_MONOTONIC, &finish);

	    elapsed_ms += milliDiff(&begin, &finish);
	    ++page;
	  }
	  return elapsed_ms;
	}

	int main() {
	uint8_t *A[NUM];
	uint8_t *B[NUM];
	uint8_t *C[NUM];

	const int iter = 20000;
	double total_elapsed = 0.0;
	uint8_t info = 0;

	for(int i = 0; i < iter; ++i) {
	for(int j=0; j<NUM; ++j) {
	A[j] = malloc(4096); A[j][0] = 0;
	B[j] = malloc(4096); B[j][0] = 0;
	C[j] = malloc(4096); C[j][0] = 0;
	}

	/*
	2 と 3 で余り差が無い気がする
	*/
	#ifdef PROGRAM1
	if(!info) { puts("mem1 <- cache + mem2"); ++info; }
	total_elapsed += program1(A, B, C);
	#elif PROGRAM2
	if(!info) { puts("cache1 <- cache1 + mem1"); ++info; }
	total_elapsed += program2(A, B);
	#elif PROGRAM3
	if(!info) { puts("mem1 <- cache1 + cache2"); ++info; }
	total_elapsed += program3(A, B, C);
	#elif PROGRAM4
	if(!info) { puts("cache1 <- cache1 + cache2"); ++info; }
	total_elapsed += program4(A, B);
	#elif PROGRAM5
	if(!info) { puts("cache3 <- cache1 + cache2"); ++info; }
	total_elapsed += program5(A, B, C);
	#else
	#error hoge
	#endif

	for(int j=0; j<NUM; ++j) {
	free(A[j]);
	free(B[j]);
	free(C[j]);
	}
	}

	printf("elapsed = %lf ms\n", total_elapsed);

	return 0;
	}