Skip to content

Instantly share code, notes, and snippets.

@hayunjong83
Created March 3, 2020 13:40
Show Gist options
  • Save hayunjong83/05551b82ba73fb46da0492f6a9d3d983 to your computer and use it in GitHub Desktop.
Save hayunjong83/05551b82ba73fb46da0492f6a9d3d983 to your computer and use it in GitHub Desktop.
simple SAXPY operation
// Single-precision "a * x + y", written in place to y: y[i] = a * x[i] + y[i]
// for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so every
// element is processed no matter how many blocks/threads are launched
// (including fewer threads than n). No shared memory is used.
//
// Parameters:
//   n  number of elements to process; n <= 0 makes the kernel a no-op.
//   a  scalar multiplier applied to x.
//   x  device-accessible input of at least n floats (only read).
//   y  device-accessible input/output of at least n floats.
// Preconditions: x and y must not overlap (both are __restrict__).
__global__ void saxpy(int n, float a, const float *__restrict__ x, float *__restrict__ y)
{
    // Index math is done in 64 bits: blockIdx.x * blockDim.x is evaluated in
    // 32-bit unsigned arithmetic otherwise and can wrap for very large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        y[i] = a * x[i] + y[i];
    }
}
#include <cstdio>   // fprintf, used to report host/CUDA failures
#include <cstdlib>  // malloc / free

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver for the saxpy kernel: fills two N-element host vectors with
// 2.0f, copies them to the device, runs y = 2 * x + y there, and copies y
// back to the host.
//
// Returns 0 on success and 1 if a host allocation, a CUDA runtime call, or
// the kernel launch fails. All buffers are released on every path.
int main()
{
    const int N = 1 << 16;
    // Byte count kept in size_t so it matches what malloc/cudaMalloc/cudaMemcpy take.
    const size_t size = static_cast<size_t>(N) * sizeof(float);
    const float a = 2.0f;

    // Ceil-div so the grid covers N even if N is changed to a value that is
    // not a multiple of the block size.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    float *h_x = (float*)malloc(size);
    float *h_y = (float*)malloc(size);
    float *d_x = NULL;
    float *d_y = NULL;

    // `ok` short-circuits every later step once something has failed, so the
    // cleanup at the bottom is always reached exactly once.
    bool ok = (h_x != NULL) && (h_y != NULL);
    if (!ok)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
    }

    ok = ok && checkCuda(cudaMalloc((void**) &d_x, size), "cudaMalloc(d_x)");
    ok = ok && checkCuda(cudaMalloc((void**) &d_y, size), "cudaMalloc(d_y)");

    if (ok)
    {
        for (int i = 0; i < N; i++)
        {
            h_x[i] = 2.0f;
            h_y[i] = 2.0f;
        }
    }

    ok = ok && checkCuda(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice), "copy of x to device");
    ok = ok && checkCuda(cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice), "copy of y to device");

    if (ok)
    {
        saxpy<<<blocks, threadsPerBlock>>>(N, a, d_x, d_y);
        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "saxpy launch");
    }

    // This blocking copy waits for the kernel, so errors raised while the
    // kernel was executing are reported here.
    ok = ok && checkCuda(cudaMemcpy(h_y, d_y, size, cudaMemcpyDeviceToHost), "copy of y to host");

    // cudaFree(NULL) and free(NULL) are no-ops, so cleanup is unconditional.
    cudaFree(d_x);
    cudaFree(d_y);
    free(h_x);
    free(h_y);
    return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment