Skip to content

Instantly share code, notes, and snippets.

@imSrbh
Last active April 27, 2020 04:38
Show Gist options
  • Save imSrbh/37abe5de04b6d187c2161be22eee83dd to your computer and use it in GitHub Desktop.
Save imSrbh/37abe5de04b6d187c2161be22eee83dd to your computer and use it in GitHub Desktop.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
//for random initialization (rand, srand) and host allocation (malloc, free)
#include <stdlib.h>
#include <time.h>
//for memset
#include <cstring>
// Element-wise product kernel: each thread writes c[i] = a[i] * b[i] for one
// flat 1D global index i. Threads whose index is not below `size` do nothing,
// so the launch may contain more threads than there are elements.
//   a, b : input vectors (read by the kernel)
//   c    : output vector (written by the kernel)
//   size : number of elements to process
__global__ void mul_vec_gpu(int * a, int * b, int* c, int size)
{
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard clause for the surplus threads of the last block.
    if (gid >= size)
        return;

    c[gid] = a[gid] * b[gid];
}
// Host reference implementation of the element-wise product:
// stores a[k] * b[k] into c[k] for every k in [0, size).
// A non-positive `size` leaves c untouched.
void mul_vec_cpu(int * a, int * b, int * c, int size)
{
    int k = 0;
    while (k < size)
    {
        const int product = a[k] * b[k];
        c[k] = product;
        ++k;
    }
}
// Terminates the program with a diagnostic if a CUDA runtime call did not
// return cudaSuccess. `what` names the operation for the error message.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two random integer vectors element-wise on the CPU and on the
// GPU, checks that both results agree and, if they do, prints the measured
// time of each phase.
//
// Returns EXIT_SUCCESS when the results match; EXIT_FAILURE when they differ
// or a host allocation fails. Any failing CUDA call aborts via check_cuda().
int main()
{
    const int size = 10000;
    const int block_size = 128;

    // Number of bytes needed to hold `size` ints.
    const size_t NO_BYTES = (size_t)size * sizeof(int);

    // Host buffers: the two inputs and one result buffer per implementation.
    int *h_a = (int *)malloc(NO_BYTES);
    int *h_b = (int *)malloc(NO_BYTES);
    int *gpu_result = (int *)malloc(NO_BYTES);
    int *cpu_result = (int *)malloc(NO_BYTES);
    if (h_a == NULL || h_b == NULL || gpu_result == NULL || cpu_result == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", NO_BYTES);
        // free(NULL) is a no-op, so every pointer can be released unconditionally.
        free(cpu_result);
        free(gpu_result);
        free(h_b);
        free(h_a);
        return EXIT_FAILURE;
    }

    // Fill both inputs with random values in [0, 255]; a then b, so the
    // sequence of rand() calls is the same as filling them in two loops.
    srand((unsigned)time(NULL));
    for (int i = 0; i < size; i++)
    {
        h_a[i] = (int)(rand() & 0xFF);
    }
    for (int i = 0; i < size; i++)
    {
        h_b[i] = (int)(rand() & 0xFF);
    }
    memset(gpu_result, 0, NO_BYTES);
    memset(cpu_result, 0, NO_BYTES);

    // Reference multiplication on the CPU.
    clock_t cpu_start, cpu_end;
    cpu_start = clock();
    mul_vec_cpu(h_a, h_b, cpu_result, size);
    cpu_end = clock();

    // Device buffers.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    check_cuda(cudaMalloc((void **)&d_a, NO_BYTES), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void **)&d_b, NO_BYTES), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void **)&d_c, NO_BYTES), "cudaMalloc(d_c)");

    // Kernel launch parameters: ceiling division gives exactly enough blocks
    // to cover `size` elements; the kernel's bounds check handles the tail.
    dim3 block(block_size);
    dim3 grid((size + block_size - 1) / block_size);

    // Host-to-device transfer of both inputs.
    clock_t mem_htod_start, mem_htod_end;
    mem_htod_start = clock();
    check_cuda(cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(h_a -> d_a)");
    check_cuda(cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(h_b -> d_b)");
    mem_htod_end = clock();

    // Kernel execution. The launch itself returns no status: configuration
    // errors are picked up with cudaGetLastError(), and the synchronize both
    // waits for completion (so the interval covers the kernel) and reports
    // errors raised while the kernel ran.
    clock_t gpu_start, gpu_end;
    gpu_start = clock();
    mul_vec_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
    check_cuda(cudaGetLastError(), "mul_vec_gpu launch");
    check_cuda(cudaDeviceSynchronize(), "mul_vec_gpu execution");
    gpu_end = clock();

    // Device-to-host transfer of the result.
    clock_t mem_dtoh_start, mem_dtoh_end;
    mem_dtoh_start = clock();
    check_cuda(cudaMemcpy(gpu_result, d_c, NO_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> gpu_result)");
    mem_dtoh_end = clock();

    // Compare the GPU result against the CPU reference, stopping at the
    // first mismatch.
    printf("Validity Check :");
    bool results_match = true;
    for (int i = 0; i < size; i++)
    {
        if (gpu_result[i] != cpu_result[i])
        {
            results_match = false;
            break;
        }
    }

    if (results_match)
    {
        printf("After Multiplication resultant vectors of CPU and GPU are same \n");
        printf("CPU mul time : %4.6f \n",
            (double)((double)(cpu_end - cpu_start) / CLOCKS_PER_SEC));
        printf("GPU kernel execution time mul time : %4.6f \n",
            (double)((double)(gpu_end - gpu_start) / CLOCKS_PER_SEC));
        printf("Mem transfer host to device : %4.6f \n",
            (double)((double)(mem_htod_end - mem_htod_start) / CLOCKS_PER_SEC));
        printf("Mem transfer device to host : %4.6f \n",
            (double)((double)(mem_dtoh_end - mem_dtoh_start) / CLOCKS_PER_SEC));
        printf("Total GPU time : %4.6f \n",
            (double)((double)((mem_htod_end - mem_htod_start)
                + (gpu_end - gpu_start)
                + (mem_dtoh_end - mem_dtoh_start)) / CLOCKS_PER_SEC));
    }
    else
    {
        printf("Vectors are different \n");
    }

    // Single cleanup path shared by the match and mismatch outcomes, so every
    // device and host buffer (including cpu_result) is released either way.
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    free(cpu_result);
    free(gpu_result);
    free(h_b);
    free(h_a);
    check_cuda(cudaDeviceReset(), "cudaDeviceReset");

    return results_match ? EXIT_SUCCESS : EXIT_FAILURE;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment