Created
January 15, 2011 14:26
-
-
Save quantumelixir/780937 to your computer and use it in GitHub Desktop.
Matrix-Vector multiplication optimization and benchmarking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* y += A*x | |
* | |
* Matrix-Vector multiplication optimization experiment | |
* TODO: Benchmark against the following: | |
* eigen2: ourselves, with the default options (SSE2 vectorization enabled). | |
* eigen2_novec: ourselves but with Eigen's explicit vectorization disabled. However, gcc's auto-vectorization was enabled. | |
* INTEL_MKL: The Intel Math Kernel Library, which includes a BLAS/LAPACK (11.0). Closed-source. | |
* ACML: The AMD's core math library, which includes a BLAS/LAPACK (4.2.0). Closed-source. | |
* GOTO: The GOTO BLAS library (1.26). | |
* ATLAS: The math-atlas BLAS library (3.8.3). | |
*/ | |
#include <stdio.h> | |
#include <malloc.h> | |
#include <stdlib.h> | |
#include <cblas.h> | |
#include <math.h> | |
#include <time.h> | |
void my_gemv(int M, int N, double* As, double* xs, double* ys); | |
// time stuff | |
static clock_t tic_timestart; | |
void tic(void) { | |
tic_timestart = clock(); | |
} | |
double toc(void) { | |
clock_t tic_timestop; | |
tic_timestop = clock(); | |
printf("%8.2f\n", (double)(tic_timestop - tic_timestart) / CLOCKS_PER_SEC); | |
return (double)(tic_timestop - tic_timestart) / CLOCKS_PER_SEC; | |
} | |
double tocq(void) { | |
clock_t tic_timestop; | |
tic_timestop = clock(); | |
return (double)(tic_timestop - tic_timestart) / CLOCKS_PER_SEC; | |
} | |
double | |
error(double* a, double* b, int n) | |
{ | |
double e = 0.0; | |
for (int i = 0; i < n; ++i) | |
e += fabs(a[i] - b[i]); | |
return e; | |
} | |
void | |
print_vector(double* y, int n) | |
{ | |
printf("------------------------------------\n"); | |
for (int i = 0; i < n; ++i) | |
printf("%g ", y[i]); | |
printf("\n"); | |
printf("------------------------------------\n\n"); | |
} | |
void | |
init(double** y, double **ymine, double** A, double** x, int M, int N) | |
{ | |
*y = (double *) malloc(sizeof(double) * M); | |
*ymine = (double *) malloc(sizeof(double) * M); | |
*A = (double *) malloc(sizeof(double) * M * N); | |
*x = (double *) malloc(sizeof(double) * N); | |
srand(time(NULL)); | |
for (int i = 0; i < N; ++i) | |
(*x)[i] = rand() * 1.0/RAND_MAX; | |
for (int i = 0; i < M; ++i) | |
for (int j = 0; j < N; ++j) | |
(*A)[i*N + j] = rand() * 1.0/RAND_MAX; | |
} | |
void | |
destroy(double *y, double *ymine, double* A, double *x) | |
{ | |
free(y); | |
free(ymine); | |
free(A); | |
free(x); | |
} | |
int main(int argc, char **argv) | |
{ | |
double *ys, *ymine, *As, *xs; | |
int M, N; | |
if (argc < 3) { | |
printf("Run with required arguments M and N\n"); | |
exit(1); | |
} | |
else { | |
M = atoi(argv[1]); | |
N = atoi(argv[2]); | |
} | |
printf("Setting up.. "); | |
tic(); | |
init(&ys, &ymine, &As, &xs, M, N); | |
toc(); | |
printf("\n"); | |
printf( "cblas_sgemv: " ); | |
tic(); | |
cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, As, N, xs, 1, 0.0, ys, 1); | |
toc(); | |
printf( "my_gemv : " ); | |
tic(); | |
my_gemv(M, N, As, xs, ymine); | |
toc(); | |
printf("Max absolute error: %g\n", error(ys, ymine, M)); | |
destroy(ys, ymine, As, xs); | |
return 0; | |
} | |
void | |
my_gemv(int M, int N, double* As, double* xs, double* ys) | |
{ | |
// register blocking? | |
double *x = xs, *A = As, *y = ys; | |
double *ye = ys + M, *xe = xs + N; | |
double *y1 = ys, *y2 = ys + 1; | |
double *A1 = As, *A2 = As + N; | |
while (y1 != ye) | |
{ | |
x = xs; | |
while (x != xe) | |
{ | |
*y1 += *A1++ * *x; | |
*y2 += *A2++ * *x; | |
x++; | |
} | |
y1+=2; y2+=2; | |
A1+=N; A2+=N; | |
} | |
} | |
// gcc -std=gnu9x -L/opt/local/lib -lcblas -I/usr/include/malloc/ -o gemv gemv.c | |
/* | |
* // naive version | |
* [19:05][~]$ g++ -O3 gemv.cpp && time ./gemv 10000 10000 | |
* | |
* real 0m3.047s | |
* user 0m2.413s | |
* sys 0m0.467s | |
* | |
* for (int i = 0; i < M; ++i) | |
* for (int j = 0; j < N; ++j) | |
* ys[i] += As[i*M + j] * xs[j]; | |
*/ | |
/* | |
* // don't use subscripted array access | |
* [19:05][~]$ g++ -O3 -o gemv gemv.cpp && time ./gemv 10000 10000 | |
* | |
* real 0m0.980s | |
* user 0m0.430s | |
* sys 0m0.480s | |
* | |
* float *x = xs, *A = As, *y = ys; | |
* float *ye = ys + M, *xe = xs + N; | |
* while (y < ye) | |
* { | |
* x = xs; | |
* while (x < xe) | |
* *y += *A++ * *x++; | |
* y++; | |
* } | |
* | |
*/ | |
/* | |
* // changing < to != | |
* float *x = xs, *A = As, *y = ymine; | |
* float *ye = ymine + M, *xe = xs + N; | |
* while (y != ye) | |
* { | |
* x = xs; | |
* while (x != xe) | |
* *y += *A++ * *x++; | |
* y++; | |
* } | |
* | |
*/ | |
/* | |
* // register blocking? | |
* float *x = xs, *A = As, *y = ys; | |
* float *ye = ys + M, *xe = xs + N; | |
* | |
* float *y1 = ys, *y2 = ys + 1; | |
* float *A1 = As, *A2 = As + N; | |
* | |
* while (y1 != ye) | |
* { | |
* x = xs; | |
* while (x != xe) | |
* { | |
* *y1 += *A1++ * *x; | |
* *y2 += *A2++ * *x; | |
* x++; | |
* } | |
* y1+=2; y2+=2; | |
* A1+=N; A2+=N; | |
* } | |
* | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment