Skip to content

Instantly share code, notes, and snippets.

@thirdwing
Last active February 9, 2018 19:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thirdwing/5b64aa80afa37b1c4cf27cefbd49306c to your computer and use it in GitHub Desktop.
Save thirdwing/5b64aa80afa37b1c4cf27cefbd49306c to your computer and use it in GitHub Desktop.
// gcc -msse3 -O3 sse_4x4.c -l cblas
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <emmintrin.h>
// use cblas to verify the results
#include <cblas.h>
/* Matrix dimension and element count for the fixed-size 4x4 kernel. */
enum { MAT_DIM = 4, MAT_ELEMS = MAT_DIM * MAT_DIM };

/*
 * Computes C += A * B for 4x4 row-major matrices of doubles using SSE2.
 *
 * Each __m128d register holds two doubles, so one row of C (four doubles)
 * is kept in two accumulators. For output row j, every element A[j][i] is
 * broadcast to both lanes and multiplied by row i of B; the products are
 * accumulated on top of the existing contents of C.
 *
 * Unaligned loads/stores are used, so the arrays need no special alignment.
 * C must not overlap A or B.
 */
static void sse_dgemm_4x4(const double *A, const double *B, double *C) {
    for (int j = 0; j < MAT_DIM; j++) {
        double *c_row = C + j * MAT_DIM;
        __m128d c1 = _mm_loadu_pd(c_row);      /* C[j][0..1] */
        __m128d c2 = _mm_loadu_pd(c_row + 2);  /* C[j][2..3] */
        for (int i = 0; i < MAT_DIM; i++) {
            const double *b_row = B + i * MAT_DIM;
            __m128d b1 = _mm_loadu_pd(b_row);      /* B[i][0..1] */
            __m128d b2 = _mm_loadu_pd(b_row + 2);  /* B[i][2..3] */
            /* broadcast A[j][i] into both lanes */
            __m128d a = _mm_load1_pd(A + j * MAT_DIM + i);
            c1 = _mm_add_pd(c1, _mm_mul_pd(a, b1)); /* multiply and add */
            c2 = _mm_add_pd(c2, _mm_mul_pd(a, b2));
        }
        /* store the finished row back into the array */
        _mm_storeu_pd(c_row, c1);
        _mm_storeu_pd(c_row + 2, c2);
    }
}

/*
 * Fills A, B and C with random values in [0, 1], computes C += A * B once
 * with the SSE kernel and once with cblas_dgemm, prints both results side by
 * side and compares them.
 *
 * Returns EXIT_SUCCESS when the two results agree within a small tolerance,
 * EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv) {
    (void) argc;
    (void) argv;

    /* Allowed absolute difference; the two implementations may round
     * differently (e.g. a BLAS built with fused multiply-add). */
    const double tolerance = 1e-12;

    double A[MAT_ELEMS];
    double B[MAT_ELEMS];
    double C_sse[MAT_ELEMS];
    double C_blas[MAT_ELEMS];

    srand((unsigned) time(NULL));
    for (int i = 0; i < MAT_ELEMS; ++i) {
        A[i] = (double) rand() / RAND_MAX;
        B[i] = (double) rand() / RAND_MAX;
        /* both implementations start from the same C */
        C_sse[i] = C_blas[i] = (double) rand() / RAND_MAX;
    }

    sse_dgemm_4x4(A, B, C_sse);

    /* Reference: C_blas = alpha * A * B + beta * C_blas, row-major, no
     * transposition, leading dimension MAT_DIM for all three matrices. */
    double alpha = 1, beta = 1;
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                MAT_DIM, MAT_DIM, MAT_DIM,
                alpha, A, MAT_DIM, B, MAT_DIM, beta, C_blas, MAT_DIM);

    double max_diff = 0.0;
    for (int i = 0; i < MAT_ELEMS; ++i) {
        printf("C_sse[%d] = %f, C_blas[%d] = %f\n", i, C_sse[i], i, C_blas[i]);
        double diff = C_sse[i] - C_blas[i];
        if (diff < 0) {
            diff = -diff;
        }
        if (diff > max_diff) {
            max_diff = diff;
        }
    }

    printf("max |C_sse - C_blas| = %g\n", max_diff);
    if (max_diff > tolerance) {
        fprintf(stderr, "MISMATCH: SSE and BLAS results differ by more than %g\n",
                tolerance);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment