Skip to content

Instantly share code, notes, and snippets.

@thirdwing
Created February 9, 2018 19:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thirdwing/1246194f9b5bd93b3c37c2f909482f7e to your computer and use it in GitHub Desktop.
Save thirdwing/1246194f9b5bd93b3c37c2f909482f7e to your computer and use it in GitHub Desktop.
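// 2x2 double-precision C += A*B with SSE2 intrinsics, printed next to the
// cblas_dgemm result for comparison.
// Build: gcc -msse3 -O3 sse_2x2.c -l cblas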
#include <mmintrin.h>
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <emmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
// use cblas to verify the results
#include <cblas.h>
/*
 * Accumulate the 2x2 product A*B into C (C += A*B) using SSE2 intrinsics.
 *
 * A, B and C each point to 4 doubles holding a 2x2 matrix in row-major
 * order. Unaligned loads/stores are used, so the arrays need no special
 * alignment.
 */
static void sse_dgemm_2x2(const double *A, const double *B, double *C) {
    __m128d c_row0 = _mm_loadu_pd(C + 0); // (C[0], C[1])
    __m128d c_row1 = _mm_loadu_pd(C + 2); // (C[2], C[3])

    for (int k = 0; k < 2; k++) {
        // Broadcast A[0][k] and A[1][k] (column k of A) into both lanes.
        __m128d a0k = _mm_load1_pd(A + 0 + k);
        __m128d a1k = _mm_load1_pd(A + 2 + k);
        // Row k of B: (B[k][0], B[k][1]).
        __m128d b_row = _mm_loadu_pd(B + k * 2);

        // Rank-1 update: each row of C gains A[row][k] * B[k][:].
        c_row0 = _mm_add_pd(c_row0, _mm_mul_pd(a0k, b_row));
        c_row1 = _mm_add_pd(c_row1, _mm_mul_pd(a1k, b_row));
    }

    _mm_storeu_pd(C + 0, c_row0);
    _mm_storeu_pd(C + 2, c_row1);
}

/*
 * Fill A, B and C with random values in [0, 1], compute C += A*B once with
 * the SSE kernel and once with cblas_dgemm, print both results side by
 * side, and compare them.
 *
 * Returns 0 when every element agrees within a small absolute tolerance,
 * EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv) {
    (void) argc; // command-line arguments are not used
    (void) argv;

    // Absolute tolerance: operands are in [0, 1], so results are O(1) and
    // any difference beyond rounding noise indicates a real mismatch.
    const double tolerance = 1e-12;

    double A[4];
    double B[4];
    double C_sse[4];
    double C_blas[4];

    srand((unsigned) time(NULL));
    for (int i = 0; i < 4; ++i) {
        A[i] = (double) rand() / RAND_MAX;
        B[i] = (double) rand() / RAND_MAX;
        // Both implementations must start from the same C.
        C_sse[i] = C_blas[i] = (double) rand() / RAND_MAX;
    }

    sse_dgemm_2x2(A, B, C_sse);

    // Reference: C_blas = alpha * A * B + beta * C_blas with alpha = beta = 1.
    double alpha = 1, beta = 1;
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 2, 2, 2,
                alpha, A, 2, B, 2, beta, C_blas, 2);

    int mismatches = 0;
    for (int i = 0; i < 4; ++i) {
        printf("C_sse[%d] = %f, C_blas[%d] = %f\n", i, C_sse[i], i, C_blas[i]);

        // Manual absolute value avoids needing libm at link time.
        double diff = C_sse[i] - C_blas[i];
        if (diff < 0) {
            diff = -diff;
        }
        // Negated form so that a NaN difference also counts as a mismatch.
        if (!(diff <= tolerance)) {
            ++mismatches;
        }
    }

    if (mismatches != 0) {
        fprintf(stderr, "%d element(s) differ between SSE and BLAS results\n",
                mismatches);
        return EXIT_FAILURE;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment