Skip to content

Instantly share code, notes, and snippets.

@sedflix
Last active April 1, 2018 14:53
Show Gist options
  • Save sedflix/c27e7a3ab1df05fe26e9aa76bd9e8695 to your computer and use it in GitHub Desktop.
Save sedflix/c27e7a3ab1df05fe26e9aa76bd9e8695 to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <sys/time.h>
// gcc -DCLS=$(getconf LEVEL1_DCACHE_LINESIZE) -fopenmp idk_matrix.c
// the nxn matrix you want
#define N 1024
// CLS is Cache Line Size
// The block size
#define SM (CLS / sizeof (double))
#define AVG 20
/*
* We align the data in the cache and memory so that it can be easily cached and it is not affected by other processes memory
* 64 is assumed to the cache line size on my system
*/
// Input array initialisation
double mul1[N][N] __attribute__ ((aligned (CLS)));
double mul2[N][N] __attribute__ ((aligned (CLS)));
// will store the result
double res[N][N] __attribute__ ((aligned (CLS)));
void multiply(int n) {
// filling array with random numbers
for (int r = 0; r < n; ++r) {
for (int c = 0; c < n; ++c) {
mul1[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0);
mul2[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0);
}
}
int i, i2, j, j2, k, k2;
double *restrict rres;
double *restrict rmul1;
double *restrict rmul2;
#pragma omp parallel shared(res, mul1, mul2) private(i, j, k, i2, k2, j2, rres, rmul1, rmul2)
{
#pragma omp for
for (i = 0; i < N; i += SM) {
for (j = 0; j < N; j += SM) {
for (k = 0; k < N; k += SM) {
// We are multipying the matrix in blocks.
// We fetch the instructions that can fit into memory use it all.
for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM; ++i2, rres += N, rmul1 += N) {
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM; ++k2, rmul2 += N) {
__m128d m1d = _mm_load_sd(&rmul1[k2]);
m1d = _mm_unpacklo_pd(m1d, m1d);
for (j2 = 0; j2 < SM; j2 += 2) {
__m128d m2 = _mm_load_pd(&rmul2[j2]);
__m128d r2 = _mm_load_pd(&rres[j2]);
_mm_store_pd(&rres[j2],
_mm_add_pd(_mm_mul_pd(m2, m1d), r2));
}
}
}
}
}
}
}
}
void getTime(int n) {
struct timeval t0, t1;
double elapsed=0;
for (int i = 0; i < AVG; ++i) {
gettimeofday(&t0, 0);
multiply(n);
gettimeofday(&t1, 0);
elapsed += (t1.tv_sec - t0.tv_sec) * 1.0f + (t1.tv_usec - t0.tv_usec) / 1000000.0f;
}
printf("%d \t %f\n", n, elapsed/AVG);
}
void main() {
getTime(N);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment