Skip to content

Instantly share code, notes, and snippets.

@imSrbh
Created April 24, 2020 08:05
Show Gist options
  • Save imSrbh/0584f9a9c726d7962e2a40c77613ca56 to your computer and use it in GitHub Desktop.
Save imSrbh/0584f9a9c726d7962e2a40c77613ca56 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_common.cuh"
#include "common.h"
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, nx).
// Expects a 1D grid of 1D blocks; nx is the number of elements in a, b and c.
// The launch may cover more than nx threads (ceil-divided grid): threads whose
// global index falls at or beyond nx do nothing.
__global__ void sum_arrays_1Dgrid_1Dblock(float* a, float* b, float *c, int nx)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail so surplus threads never touch memory past the buffers.
    if (gid < nx)
        c[gid] = a[gid] + b[gid];
}
// Element-wise addition c = a + b over nx * ny elements, launched with a
// 2D grid of 2D blocks. The x thread coordinate selects the column and the
// y thread coordinate selects the row; elements are addressed as row * nx + column.
__global__ void sum_arrays_2Dgrid_2Dblock(float* a, float* b, float *c, int nx, int ny)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that land outside the nx-by-ny extent have no element to process.
    if (col >= nx || row >= ny)
        return;

    const int idx = row * nx + col;
    c[idx] = a[idx] + b[idx];
}
// Host driver for the 1D vector-add example.
//
// Command-line arguments read here:
//   argv[2] - input size as a power-of-two exponent (size = 1 << x), default 22
//   argv[4] - block size as a power-of-two exponent (threads = 1 << x), default 128 threads
// Out-of-range exponents are reported and the default is kept.
//
// Fills two host arrays, computes the reference sum with sum_array_cpu, runs
// sum_arrays_1Dgrid_1Dblock on the device and hands both results to compare_arrays.
void run_sum_array_1d(int argc, char** argv)
{
    printf("Runing 1D grid \n");

    int size = 1 << 22;
    int block_size = 128;

    if (argc > 2)
    {
        // A shift by a negative amount or by 31+ bits is undefined for int.
        int size_exp = atoi(argv[2]);
        if (size_exp < 0 || size_exp > 30)
            printf("Input size is invalid, default input size used (%d)\n", size);
        else
            size = 1 << size_exp;
    }

    if (argc > 4)
    {
        // More than 1024 threads per block (exponent > 10) is not a launchable block.
        int block_exp = atoi(argv[4]);
        if (block_exp < 0 || block_exp > 10)
            printf("Block size is invalid, default block size used (%d)\n", block_size);
        else
            block_size = 1 << block_exp;
    }

    // size_t keeps the byte count from wrapping for the largest accepted sizes.
    size_t byte_size = (size_t)size * sizeof(float);

    printf("Input size : %d \n",size);

    float *h_a = (float*)malloc(byte_size);
    float *h_b = (float*)malloc(byte_size);
    float *h_out = (float*)malloc(byte_size);
    float *h_ref = (float*)malloc(byte_size);

    // Every host buffer is dereferenced below, so bail out if any allocation failed.
    if (!h_a || !h_b || !h_out || !h_ref)
    {
        printf("host memory allocation error \n");
        free(h_ref);
        free(h_out);
        free(h_b);
        free(h_a);
        return;
    }

    for (int i = 0; i < size; i++)
    {
        h_a[i] = (float)(i % 10);
        h_b[i] = (float)(i % 7);
    }

    // CPU reference result goes to h_out; the GPU result is copied into h_ref.
    sum_array_cpu(h_a, h_b, h_out,size);

    dim3 block(block_size);
    // Ceil-divide so that every element is assigned a thread.
    dim3 grid((size + block.x - 1) / block.x);

    printf("Kernel is lauch with grid(%d,%d,%d) and block(%d,%d,%d) \n",
        grid.x,grid.y,grid.z,block.x,block.y, block.z);

    float *d_a, *d_b, *d_c;
    gpuErrchk(cudaMalloc((void**)&d_a, byte_size));
    gpuErrchk(cudaMalloc((void**)&d_b, byte_size));
    gpuErrchk(cudaMalloc((void**)&d_c, byte_size));
    gpuErrchk(cudaMemset(d_c, 0, byte_size));

    gpuErrchk(cudaMemcpy(d_a, h_a, byte_size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, byte_size, cudaMemcpyHostToDevice));

    sum_arrays_1Dgrid_1Dblock<<<grid, block>>>(d_a, d_b, d_c, size);
    // A launch does not return a status; query it explicitly to catch bad configurations.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_ref, d_c, byte_size, cudaMemcpyDeviceToHost));

    compare_arrays(h_out, h_ref,size);

    gpuErrchk(cudaFree(d_c));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_a));

    free(h_ref);
    free(h_out);
    free(h_b);
    free(h_a);
}
// Host driver for the 2D vector-add example.
//
// Command-line arguments read here:
//   argv[2] - input size as a power-of-two exponent (size = 1 << x), default 22
//   argv[3] - nx as a power-of-two exponent (nx = 1 << x), default 14; ny = size / nx
//   argv[4] - block.x as a power-of-two exponent in [3, 10]; block.y becomes 1024 / block.x
// Out-of-range exponents are reported and the default is kept.
//
// Fills two host arrays, computes the reference sum with sum_array_cpu, runs
// sum_arrays_2Dgrid_2Dblock on the device and hands both results to compare_arrays.
void run_sum_array_2d(int argc, char** argv)
{
    printf("Runing 2D grid \n");

    int size = 1 << 22; // 4194304
    int nx = 1 << 14;   // 16384
    int block_x = 128;
    int block_y = 8;

    if (argc > 2)
    {
        // A shift by a negative amount or by 31+ bits is undefined for int.
        int size_exp = atoi(argv[2]);
        if (size_exp < 0 || size_exp > 30)
            printf("Input size is invalid, default input size used (%d)\n", size);
        else
            size = 1 << size_exp;
    }

    if (argc > 3)
    {
        int nx_exp = atoi(argv[3]);
        if (nx_exp < 0 || nx_exp > 30)
            printf("nx is invalid, default nx used (%d)\n", nx);
        else
            nx = 1 << nx_exp;
    }

    // nx larger than size would make ny zero and the grid empty in y.
    if (nx > size)
    {
        printf("nx exceeds input size, nx set to %d\n", size);
        nx = size;
    }

    int ny = size / nx;

    if (argc > 4)
    {
        int block_exp = atoi(argv[4]);

        if (block_exp < 3 || block_exp > 10)
        {
            printf("Block size is invalid, default block size used (%d,%d)\n",block_x,block_y);
        }
        else
        {
            // Keep the block at 1024 threads in total.
            block_x = 1 << block_exp;
            block_y = 1024 / block_x;
        }
    }

    // size_t keeps the byte count from wrapping for the largest accepted sizes.
    size_t byte_size = (size_t)size * sizeof(float);

    printf("Input size : %d, nx : %d, ny : %d, block_x : %d, block_y : %d \n",
        size,nx,ny,block_x,block_y);

    float *h_a = (float*)malloc(byte_size);
    float *h_b = (float*)malloc(byte_size);
    float *h_out = (float*)malloc(byte_size);
    float *h_ref = (float*)malloc(byte_size);

    // Every host buffer is dereferenced below, so bail out if any allocation failed.
    if (!h_a || !h_b || !h_out || !h_ref)
    {
        printf("host memory allocation error \n");
        free(h_ref);
        free(h_out);
        free(h_b);
        free(h_a);
        return;
    }

    for (int i = 0; i < size; i++)
    {
        h_a[i] = (float)(i % 10);
        h_b[i] = (float)(i % 7);
    }

    // CPU reference result goes to h_out; the GPU result is copied into h_ref.
    sum_array_cpu(h_a, h_b, h_out, size);

    dim3 block(block_x,block_y);
    // Ceil-divide in both dimensions so the grid covers all nx-by-ny elements.
    dim3 grid((nx + block_x - 1) / block_x, (ny + block_y - 1) / block_y);

    printf("Kernel is lauch with grid(%d,%d,%d) and block(%d,%d,%d) \n",
        grid.x, grid.y, grid.z, block.x, block.y, block.z);

    float *d_a, *d_b, *d_c;
    gpuErrchk(cudaMalloc((void**)&d_a, byte_size));
    gpuErrchk(cudaMalloc((void**)&d_b, byte_size));
    gpuErrchk(cudaMalloc((void**)&d_c, byte_size));
    gpuErrchk(cudaMemset(d_c, 0, byte_size));

    gpuErrchk(cudaMemcpy(d_a, h_a, byte_size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, byte_size, cudaMemcpyHostToDevice));

    sum_arrays_2Dgrid_2Dblock<<<grid, block>>>(d_a, d_b, d_c, nx, ny);
    // A launch does not return a status; query it explicitly to catch bad
    // configurations (for example a grid dimension beyond the device limit).
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_ref, d_c, byte_size, cudaMemcpyDeviceToHost));

    compare_arrays(h_out, h_ref, size);

    gpuErrchk(cudaFree(d_c));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_a));

    free(h_ref);
    free(h_out);
    free(h_b);
    free(h_a);
}
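////arguments :
////1 - kernel (0:1D or 1:2D),
////2 - input size as a power-of-two exponent (size = 2 pow (x))
////3 - for 2D kernel, nx as a power-of-two exponent (nx = 2 pow (x))
////4 - block.x as a power-of-two exponent (2 pow (x)); the 2D kernel derives block.y as 1024 / block.x
////5 - block.y (listed for completeness; neither runner reads this argument)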
//int main(int argc, char** argv)
//{
// printf("\n----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ \n\n");
// if (argc > 1)
// {
// if (atoi(argv[1]) > 0)
// {
// run_sum_array_2d(argc, argv);
// }
// else
// {
// run_sum_array_1d(argc, argv);
// }
// }
// else
// {
// run_sum_array_1d(argc, argv);
// }
//
// //query_device();
// return 0;
//}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment