// Source: GitHub gist EnisBerk/3731be6096fb049fdc672e0c7540bc2e
// Created November 7, 2016 11:34.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CUDA events recorded immediately before and after the timed kernel launch in main().
cudaEvent_t start, stop;
// Elapsed time between `start` and `stop` for the most recent launch, as
// reported by cudaEventElapsedTime (milliseconds per the CUDA runtime docs).
float time2;
// Sum of `time2` over all benchmark iterations; main() prints its average.
float total_time = 0;
// Single-precision a*x + y, computed in place: y[i] = a*x[i] + y[i] for i in [0, n).
//
// Each thread handles exactly one element (there is no stride loop), so the
// launch must supply at least n threads in a 1D grid of 1D blocks. Threads
// whose global index is >= n do nothing.
//
//   n : number of elements to process
//   a : scalar multiplier
//   x : input vector, read at index i
//   y : input/output vector, read and overwritten at index i
__global__
void saxpy(int n, float a, float *x, float *y)
{
    // Compute the global index in 64 bits: blockDim.x * blockIdx.x is a 32-bit
    // unsigned product that can wrap on very large launches, which would let
    // a wrapped index slip past the bounds check below.
    const long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        y[i] = a * x[i] + y[i];
    }
}
// Element-wise add with index-based broadcasting: z[i] = X(i) + Y(i) for i in [0, n).
//
// For each operand (shown for x; y is handled identically with sy/ny) the
// source element is chosen by the first matching rule:
//   nx == n  -> x[i]             (operand has the full output length)
//   sx == 1  -> x[i % nx]        (operand repeats with period nx)
//   nx == 1  -> x[0]             (scalar operand)
//   otherwise-> x[(i / sx) % nx] (each element is held for sx consecutive outputs)
//
// The kernel is a grid-stride loop over a 1D launch, so any grid/block size
// covers all n outputs.
//
//   n      : number of output elements
//   x, y   : input arrays with nx and ny elements respectively
//   sx, sy : stride parameters used by the last rule above; must be non-zero
//            when that rule is reached (they are used as divisors)
//   z      : output array with n elements
__global__ void _add_32_12(int n, float *x, int sx, int nx, float *y, int sy, int ny, float *z) {
    // Track the loop position in 64 bits. With a 32-bit int, adding the grid
    // stride to a position near INT_MAX wraps to a negative value that still
    // satisfies `< n` and would index out of bounds.
    const long long stride = (long long)blockDim.x * gridDim.x;
    for (long long pos = threadIdx.x + (long long)blockIdx.x * blockDim.x; pos < n; pos += stride) {
        // pos < n <= INT_MAX here, so the narrowing is lossless and keeps the
        // division/modulo below in 32-bit arithmetic.
        const int i = (int)pos;
        float xi = (nx==n ? x[i] : sx==1 ? x[i%nx] : nx==1 ? x[0] : x[(i/sx)%nx]);
        float yi = (ny==n ? y[i] : sy==1 ? y[i%ny] : ny==1 ? y[0] : y[(i/sy)%ny]);
        z[i] = xi+yi;
    }
}
// Print a diagnostic and terminate if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what, int line)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error at line %d (%s): %s\n",
                line, what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
#define CHECK_CUDA(call) checkCuda((call), #call, __LINE__)

// Benchmark driver: times _add_32_12 adding a length-N vector x, broadcast
// against an N*N array y, into an N*N array z, and prints the average kernel
// time over iter_num iterations (as reported by cudaEventElapsedTime).
int main(void)
{
    const int iter_num = 1;
    // N*N must fit in an int (the kernel takes int sizes) and three N*N float
    // buffers must fit in memory; 4096*4096 floats is 64 MiB per buffer.
    const int N = 1 << 12;
    const int nx = N;
    const int ny = N * N;
    const int nz = N * N;
    const int sx = N;
    const int sy = N;
    const int n = nz;

    // Byte counts are computed in size_t so they cannot overflow int.
    const size_t x_bytes = (size_t)nx * sizeof(float);
    const size_t y_bytes = (size_t)ny * sizeof(float);
    const size_t z_bytes = (size_t)nz * sizeof(float);

    float *x = (float*)malloc(x_bytes);
    float *y = (float*)malloc(y_bytes);
    float *z = (float*)malloc(z_bytes);
    if (x == NULL || y == NULL || z == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(x);
        free(y);
        free(z);
        return EXIT_FAILURE;
    }

    float *d_x = NULL, *d_y = NULL, *d_z = NULL;
    CHECK_CUDA(cudaMalloc(&d_x, x_bytes));
    CHECK_CUDA(cudaMalloc(&d_y, y_bytes));
    CHECK_CUDA(cudaMalloc(&d_z, z_bytes));

    // The events are reused across iterations, so create them once.
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    srand((unsigned)time(NULL));
    for (int iter = 0; iter < iter_num; iter++) {
        // Fresh random inputs for every iteration.
        for (int k = 0; k < nx; k++) {
            x[k] = (float)rand();
        }
        for (int k = 0; k < ny; k++) {
            y[k] = (float)rand();
        }
        CHECK_CUDA(cudaMemcpy(d_x, x, x_bytes, cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(d_y, y, y_bytes, cudaMemcpyHostToDevice));
        // d_z is written in full by the kernel, so nothing is uploaded for it.

        // Time only the broadcasting add; the kernel must receive the DEVICE
        // pointers, not the host buffers.
        time2 = 0;
        CHECK_CUDA(cudaEventRecord(start, 0));
        _add_32_12<<<256,256>>>(n, d_x, sx, nx, d_y, sy, ny, d_z);
        CHECK_CUDA(cudaGetLastError());  // surfaces launch-configuration errors
        CHECK_CUDA(cudaEventRecord(stop, 0));
        CHECK_CUDA(cudaEventSynchronize(stop));
        CHECK_CUDA(cudaEventElapsedTime(&time2, start, stop));
        total_time += time2;

        // Fetch the result; this blocking copy also reports any fault raised
        // while the kernel was executing.
        CHECK_CUDA(cudaMemcpy(z, d_z, z_bytes, cudaMemcpyDeviceToHost));
    }

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaFree(d_x));
    CHECK_CUDA(cudaFree(d_y));
    CHECK_CUDA(cudaFree(d_z));
    free(x);
    free(y);
    free(z);

    printf("\n%f\n",total_time/iter_num);
    // Approximate effective bandwidth, counting one read of y and one write of z
    // (x is only N elements):
    // printf("Effective Bandwidth (GB/s): %f\n", (double)N*N*4*2/(total_time/iter_num)/1e6);
    return 0;
}
// (GitHub page footer — sign-up / sign-in prompt — is not part of the program.)