// Skip to content
//
// Instantly share code, notes, and snippets.
//
// @EnisBerk
// Created November 7, 2016 11:34
// Show Gist options
//   • Star 0 You must be signed in to star a gist
//   • Fork 0 You must be signed in to fork a gist
//   • Save EnisBerk/3731be6096fb049fdc672e0c7540bc2e to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include<time.h>
// CUDA events recorded immediately before and after the timed kernel launch.
cudaEvent_t start;
cudaEvent_t stop;
// Elapsed time between `start` and `stop` for the most recent launch,
// as written by cudaEventElapsedTime.
float time2 = 0.0f;
// Running sum of `time2` across benchmark iterations.
float total_time = 0.0f;
/*
 * Element-wise update y[i] = a * x[i] + y[i] for the first n entries.
 *
 * Each thread handles the single element at its flat 1-D global index
 * (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is n or
 * larger leave memory untouched, so the launch must supply at least n
 * threads in total for every element to be processed.
 */
__global__
void saxpy(int n, float a, float *x, float *y)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // tail threads past the end of the arrays
    }
    y[idx] = a * x[idx] + y[idx];
}
/*
 * Element-wise z[i] = xi + yi for i in [0, n), where each operand is selected
 * per element from its own (pointer, stride, count) triple:
 *
 *   count == n        -> operand[i]
 *   else stride == 1  -> operand[i % count]
 *   else count == 1   -> operand[0]
 *   otherwise         -> operand[(i / stride) % count]
 *
 * (x, sx, nx) describes the first operand and (y, sy, ny) the second.
 * In the last case the stride and count are used as divisors, so the caller
 * must pass non-zero values for them.
 *
 * Every thread starts at its flat 1-D global index and advances by the total
 * number of launched threads (blockDim.x * gridDim.x), so any 1-D launch
 * configuration covers all n elements.
 */
__global__ void _add_32_12(int n, float *x, int sx, int nx, float *y, int sy, int ny, float *z) {
    // The running position is kept in 64 bits: with an int index, adding the
    // stride when n is within one stride of INT_MAX wraps to a negative value
    // that still satisfies `i < n` and then indexes out of bounds.
    const long long step = (long long)blockDim.x * gridDim.x;
    for (long long pos = (long long)blockIdx.x * blockDim.x + threadIdx.x; pos < n; pos += step) {
        // pos < n <= INT_MAX here, so narrowing is safe and keeps the
        // divisions/modulos below in 32-bit arithmetic.
        const int i = (int)pos;
        float xi = (nx==n ? x[i] : sx==1 ? x[i%nx] : nx==1 ? x[0] : x[(i/sx)%nx]);
        float yi = (ny==n ? y[i] : sy==1 ? y[i%ny] : ny==1 ? y[0] : y[(i/sy)%ny]);
        z[i] = xi+yi;
    }
}
// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Benchmark driver for _add_32_12.
 *
 * Fills an N-element x and an N*N-element y with random values, copies them
 * to the device, times one launch of _add_32_12<<<256,256>>> per iteration
 * with CUDA events, copies the N*N-element result back into z, and prints
 * the average of the per-iteration event timings.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main(void)
{
    const int iter_num = 1;
    // N was 1<<30, which made N*N overflow int. 1<<10 yields N*N = 1M
    // elements and keeps every count within the kernel's int parameters.
    const int N = 1 << 10;
    const int nx = N;
    const int ny = N * N;
    const int nz = N * N;
    const int sx = N;
    const int sy = N;
    const int n = nz;

    // Byte counts are computed in size_t so they cannot overflow int.
    const size_t x_bytes = (size_t)nx * sizeof(float);
    const size_t y_bytes = (size_t)ny * sizeof(float);
    const size_t z_bytes = (size_t)nz * sizeof(float);

    float *x = (float*)malloc(x_bytes);
    float *y = (float*)malloc(y_bytes);
    float *z = (float*)malloc(z_bytes);
    if (x == NULL || y == NULL || z == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(x);
        free(y);
        free(z);
        return EXIT_FAILURE;
    }

    float *d_x = NULL;
    float *d_y = NULL;
    float *d_z = NULL;
    CUDA_CHECK(cudaMalloc(&d_x, x_bytes));
    CUDA_CHECK(cudaMalloc(&d_y, y_bytes));
    CUDA_CHECK(cudaMalloc(&d_z, z_bytes));

    // Events are created once and reused by every iteration.
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    srand((unsigned)time(NULL));
    for (int iter = 0; iter < iter_num; iter++) {
        for (int i = 0; i < nx; i++) {
            x[i] = rand();
        }
        for (int i = 0; i < ny; i++) {
            y[i] = rand();
        }
        CUDA_CHECK(cudaMemcpy(d_x, x, x_bytes, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_y, y, y_bytes, cudaMemcpyHostToDevice));
        // d_z is not uploaded: n == nz, so the kernel writes every element.

        time2 = 0;
        CUDA_CHECK(cudaEventRecord(start, 0));
        // The kernel must receive the device buffers, not the host ones.
        _add_32_12<<<256,256>>>(n, d_x, sx, nx, d_y, sy, ny, d_z);
        CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors
        CUDA_CHECK(cudaEventRecord(stop, 0));
        CUDA_CHECK(cudaEventSynchronize(stop));
        CUDA_CHECK(cudaEventElapsedTime(&time2, start, stop));
        total_time += time2;

        // The kernel's output lives in d_z; fetch all nz elements of it.
        CUDA_CHECK(cudaMemcpy(z, d_z, z_bytes, cudaMemcpyDeviceToHost));
    }

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaFree(d_z));
    free(x);
    free(y);
    free(z);

    printf("\n%f\n",total_time/iter_num);
    // printf("Effective Bandwidth (GB/s): %f\n", N*N*4*3/milliseconds/1e6);
    return 0;
}
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment