Skip to content

Instantly share code, notes, and snippets.

@jeromerobert
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeromerobert/e4fffc1eb2a9168f6205 to your computer and use it in GitHub Desktop.
Save jeromerobert/e4fffc1eb2a9168f6205 to your computer and use it in GitHub Desktop.
OpenBLAS bug #478
// Build with (libraries must come after the source file on the command line):
//   gcc -Ofast -g openblas-bug478.c -lpthread -lblas -lrt
// or
//   gcc -Ofast -g openblas-bug478.c -lpthread -lopenblas -lrt
// Run with OPENBLAS_NUM_THREADS=1
#include <stdlib.h>
#include <stdio.h>
#include <cblas.h>
#include <time.h>
#include <pthread.h>
// Total iteration budget, expressed as the number of dgemv calls for a
// 1x1 matrix. main() divides it by matrix_size^2 and converts the result
// to int to get the loop count for each matrix size.
// Lower it if the test is too slow on your computer.
#define NLOOP 20e9
/* Settings describing one dgemv_bench() run. */
typedef struct BenchParam {
    int matrix_size;   /* order of the square matrix (used for rows and columns) */
    int n_loop;        /* how many times dgemv_ is called */
    int threaded;      /* non-zero: the run terminates through pthread_exit() */
} BenchParam;
/*
 * Fortran-style BLAS entry point. This file only includes <cblas.h>, so the
 * symbol is declared explicitly here instead of relying on an implicit
 * function declaration (invalid since C99).
 * NOTE(review): drop this prototype if your BLAS header already provides one.
 */
extern void dgemv_(const char *trans, const int *m, const int *n,
                   const double *alpha, const double *a, const int *lda,
                   const double *x, const int *incx,
                   const double *beta, double *y, const int *incy);

/*
 * Runs param->n_loop calls of y := v*A*x + v*y (v = 1.01) on a freshly
 * allocated param->matrix_size x param->matrix_size problem.
 *
 * param: run settings; only read, never modified.
 * Returns NULL. When param->threaded is non-zero the function does not
 * return but terminates the calling thread with pthread_exit(NULL).
 *
 * A non-positive matrix_size performs no BLAS call. An allocation failure
 * aborts the whole process, because timing a run that did no work would
 * silently produce a meaningless result.
 */
void * dgemv_bench(BenchParam * param)
{
    int size = param->matrix_size;
    int one = 1;
    double v = 1.01;
    double *A = NULL;
    double *x = NULL;
    double *y = NULL;
    int i;

    if (size > 0) {
        /* Compute the element count in size_t so size*size cannot overflow int. */
        size_t n_elem = (size_t)size * (size_t)size;
        size_t k;

        A = calloc(n_elem, sizeof *A);
        x = calloc((size_t)size, sizeof *x);
        y = calloc((size_t)size, sizeof *y);
        if (A == NULL || x == NULL || y == NULL) {
            fprintf(stderr, "dgemv_bench: cannot allocate a %dx%d problem\n",
                    size, size);
            free(A);
            free(x);
            free(y);
            exit(EXIT_FAILURE);
        }

        /* x gets the same non-zero pattern as y so that an implementation
         * which skips zero entries of x still does the full amount of work. */
        for (i = 0; i < size; i++) {
            x[i] = i;
            y[i] = i;
        }
        for (k = 0; k < n_elem; k++)
            A[k] = (double)k;

        for (i = 0; i < param->n_loop; i++) {
            /* Distinct input (x) and output (y) vectors: the previous code
             * passed y for both, i.e. aliased an input with the output. */
            dgemv_("N", &size, &size, &v, A, &size, x, &one, &v, y, &one);
        }
    }

    /* Release the buffers on every path; free(NULL) is a no-op. */
    free(A);
    free(x);
    free(y);

    if (param->threaded)
        pthread_exit(NULL);
    return NULL;
}
/*
 * Thread entry point with the exact void *(*)(void *) signature that
 * pthread_create() requires; forwards the argument to dgemv_bench().
 */
static void *dgemv_bench_thread(void *arg)
{
    return dgemv_bench(arg);
}

/*
 * Times nb_threads concurrent dgemv_bench() runs and prints one result line.
 *
 * nloop:       total number of iterations, split evenly between the threads
 *              (integer division, the remainder is dropped).
 * nb_threads:  number of threads to start; must be >= 1.
 * matrix_size: order of the square matrix each thread works on.
 * reftime:     reference duration in seconds, or a value <= 0 when this run
 *              is itself the reference.
 *
 * Returns reftime when it is > 0 (after printing the speedup reftime/dt),
 * otherwise the measured wall-clock duration dt in seconds. If nb_threads
 * is < 1 nothing is run and reftime is returned unchanged. The process is
 * terminated with exit(-1) if a thread cannot be created or joined.
 */
double thread_bench(int nloop, int nb_threads, int matrix_size, double reftime)
{
    if (nb_threads < 1) {
        /* Avoids a division by zero and an invalid VLA size below. */
        fprintf(stderr, "thread_bench: nb_threads must be >= 1 (got %d)\n",
                nb_threads);
        return reftime;
    }

    BenchParam param;
    pthread_t threads[nb_threads];
    struct timespec tick, tock;
    int t, rc;

    /* Fill the shared parameter block completely BEFORE starting any thread:
     * the workers read it concurrently, so it must not be written afterwards. */
    param.matrix_size = matrix_size;
    param.n_loop = nloop / nb_threads;
    param.threaded = 1;

    clock_gettime(CLOCK_MONOTONIC, &tick);
    for (t = 0; t < nb_threads; t++) {
        rc = pthread_create(&threads[t], NULL, dgemv_bench_thread, &param);
        if (rc) {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }
    for (t = 0; t < nb_threads; t++) {
        rc = pthread_join(threads[t], NULL);
        if (rc) {
            printf("ERROR; return code from pthread_join() is %d\n", rc);
            exit(-1);
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &tock);

    /* Elapsed wall-clock time in seconds. */
    double dt = (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;

    if (reftime > 0) {
        printf("Nb threads %d, matrix size %d, time %g, speedup %g\n", nb_threads, matrix_size, dt, reftime/dt);
        return reftime;
    }
    printf("Nb threads %d, matrix size %d, time %g\n", nb_threads, matrix_size, dt);
    return dt;
}
/*
 * Benchmark driver: for each matrix size, runs thread_bench() with every
 * thread count in turn. The first run of a size (reftime == -1) provides the
 * reference time that the following runs report their speedup against.
 * A blank line is printed after each matrix size.
 */
int main(int argc, char * argv[]) {
    const int nb_threads[5] = {1, 2, 4, 6, 12};
    const int matrix_sizes[5] = {20, 40, 60, 80, 200};
    int size_idx = 0;

    (void)argc;
    (void)argv;

    while (size_idx < 5) {
        /* Kept as double so the loop-count division is done in floating point. */
        const double ms = matrix_sizes[size_idx];
        double reftime = -1;
        int thr_idx;

        for (thr_idx = 0; thr_idx < 5; thr_idx++) {
            const int total_loops = (int)(NLOOP/(ms*ms));
            reftime = thread_bench(total_loops, nb_threads[thr_idx], ms, reftime);
        }
        puts("");
        size_idx++;
    }
    return 0;
}
Nb threads 1, matrix size 20, time 11.1883
Nb threads 2, matrix size 20, time 16.9531, speedup 0.659958
Nb threads 4, matrix size 20, time 14.639, speedup 0.764281
Nb threads 6, matrix size 20, time 15.0962, speedup 0.741134
Nb threads 12, matrix size 20, time 13.8431, speedup 0.808224
Nb threads 1, matrix size 40, time 6.76891
Nb threads 2, matrix size 40, time 5.63293, speedup 1.20167
Nb threads 4, matrix size 40, time 4.39875, speedup 1.53883
Nb threads 6, matrix size 40, time 4.02665, speedup 1.68103
Nb threads 12, matrix size 40, time 3.49245, speedup 1.93815
Nb threads 1, matrix size 60, time 6.84569
Nb threads 2, matrix size 60, time 3.9338, speedup 1.74022
Nb threads 4, matrix size 60, time 2.27475, speedup 3.00943
Nb threads 6, matrix size 60, time 1.8754, speedup 3.65024
Nb threads 12, matrix size 60, time 1.81147, speedup 3.77907
Nb threads 1, matrix size 80, time 6.14711
Nb threads 2, matrix size 80, time 3.59512, speedup 1.70985
Nb threads 4, matrix size 80, time 2.03429, speedup 3.02175
Nb threads 6, matrix size 80, time 1.32202, speedup 4.6498
Nb threads 12, matrix size 80, time 1.15665, speedup 5.31459
Nb threads 1, matrix size 200, time 6.1784
Nb threads 2, matrix size 200, time 3.37149, speedup 1.83254
Nb threads 4, matrix size 200, time 1.68204, speedup 3.67315
Nb threads 6, matrix size 200, time 1.1164, speedup 5.73275
Nb threads 12, matrix size 200, time 0.636767, speedup 9.70276
Nb threads 1, matrix size 20, time 21.3955
Nb threads 2, matrix size 20, time 10.7553, speedup 1.9893
Nb threads 4, matrix size 20, time 5.3625, speedup 3.98983
Nb threads 6, matrix size 20, time 3.6023, speedup 5.93941
Nb threads 12, matrix size 20, time 1.89335, speedup 11.3003
Nb threads 1, matrix size 40, time 16.7607
Nb threads 2, matrix size 40, time 8.41217, speedup 1.99243
Nb threads 4, matrix size 40, time 4.19766, speedup 3.99286
Nb threads 6, matrix size 40, time 2.79663, speedup 5.99317
Nb threads 12, matrix size 40, time 1.47789, speedup 11.341
Nb threads 1, matrix size 60, time 15.4368
Nb threads 2, matrix size 60, time 7.7506, speedup 1.99169
Nb threads 4, matrix size 60, time 3.8857, speedup 3.97272
Nb threads 6, matrix size 60, time 4.18131, speedup 3.69185
Nb threads 12, matrix size 60, time 1.32583, speedup 11.6431
Nb threads 1, matrix size 80, time 16.6979
Nb threads 2, matrix size 80, time 8.42563, speedup 1.98179
Nb threads 4, matrix size 80, time 4.20207, speedup 3.97372
Nb threads 6, matrix size 80, time 2.80974, speedup 5.94286
Nb threads 12, matrix size 80, time 1.43335, speedup 11.6495
Nb threads 1, matrix size 200, time 15.2966
Nb threads 2, matrix size 200, time 7.6919, speedup 1.98866
Nb threads 4, matrix size 200, time 3.85008, speedup 3.97305
Nb threads 6, matrix size 200, time 2.70593, speedup 5.65297
Nb threads 12, matrix size 200, time 1.32625, speedup 11.4599
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment