Last active
April 27, 2020 04:38
-
-
Save imSrbh/37abe5de04b6d187c2161be22eee83dd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h" | |
#include "device_launch_parameters.h" | |
#include <stdio.h> | |
//for random initialization
#include <stdlib.h> | |
#include <time.h> | |
//for memset | |
#include <cstring> | |
// Element-wise vector product kernel.
// Each thread derives one flat index from the x components of threadIdx,
// blockIdx and blockDim and stores a[i] * b[i] into c[i].
// Threads whose index is not below `size` exit without touching memory, so
// the launch may contain more threads than there are elements.
__global__ void mul_vec_gpu(int * a, int * b, int* c, int size)
{
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads in the launch have nothing to do.
    if (gid >= size)
        return;

    c[gid] = a[gid] * b[gid];
}
// Host reference implementation of the element-wise product.
// Writes a[k] * b[k] into c[k] for k in [0, size); does nothing when size
// is zero or negative.
void mul_vec_cpu(int * a, int * b, int * c, int size)
{
    int k = 0;
    while (k < size)
    {
        c[k] = a[k] * b[k];
        ++k;
    }
}
// Reports a failed CUDA runtime call on stderr and terminates the process.
// `what` names the operation for the error message. Exiting here relies on
// process teardown to release any buffers that are still allocated.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two randomly filled integer vectors on the CPU and on the GPU,
// compares the two results element by element and, when they agree, prints
// clock()-based timings for the CPU loop, the kernel and both memory copies.
//
// Returns 0 when the results match and EXIT_FAILURE when they differ or when
// a host allocation fails. A failing CUDA call terminates the process
// through check_cuda().
int main()
{
    const int size = 10000;
    const int block_size = 128;

    //number of bytes needed to hold element count
    const size_t NO_BYTES = (size_t)size * sizeof(int);

    //allocate memory for host side pointers
    int *h_a = (int *)malloc(NO_BYTES);
    int *h_b = (int *)malloc(NO_BYTES);
    int *gpu_result = (int *)malloc(NO_BYTES);
    int *cpu_result = (int *)malloc(NO_BYTES);
    if (h_a == NULL || h_b == NULL || gpu_result == NULL || cpu_result == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(cpu_result);
        free(gpu_result);
        free(h_b);
        free(h_a);
        return EXIT_FAILURE;
    }

    //initialize h_a and h_b vectors randomly with values in [0, 255]
    srand((unsigned)time(NULL));
    for (int i = 0; i < size; i++)
    {
        h_a[i] = (int)(rand() & 0xFF);
    }
    for (int i = 0; i < size; i++)
    {
        h_b[i] = (int)(rand() & 0xFF);
    }

    memset(gpu_result, 0, NO_BYTES);
    memset(cpu_result, 0, NO_BYTES);

    //multiplication in CPU
    const clock_t cpu_start = clock();
    mul_vec_cpu(h_a, h_b, cpu_result, size);
    const clock_t cpu_end = clock();

    //device buffers
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    check_cuda(cudaMalloc((void **)&d_a, NO_BYTES), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void **)&d_b, NO_BYTES), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void **)&d_c, NO_BYTES), "cudaMalloc(d_c)");

    //kernel launch parameters: ceil-div so no spare block is launched when
    //size is an exact multiple of the block size
    dim3 block(block_size);
    dim3 grid((size + block_size - 1) / block_size);

    //host to device transfer
    const clock_t mem_htod_start = clock();
    check_cuda(cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice), "copy of h_a to device");
    check_cuda(cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice), "copy of h_b to device");
    const clock_t mem_htod_end = clock();

    //execution time measuring in GPU
    const clock_t gpu_start = clock();
    mul_vec_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
    // A launch reports configuration errors only through cudaGetLastError().
    check_cuda(cudaGetLastError(), "kernel launch");
    // Wait for the kernel so the interval covers its execution and any
    // in-kernel fault is reported here.
    check_cuda(cudaDeviceSynchronize(), "kernel execution");
    const clock_t gpu_end = clock();

    //device to host transfer
    const clock_t mem_dtoh_start = clock();
    check_cuda(cudaMemcpy(gpu_result, d_c, NO_BYTES, cudaMemcpyDeviceToHost), "copy of result to host");
    const clock_t mem_dtoh_end = clock();

    //compare the CPU and GPU results
    printf("Validity Check :");
    bool vectors_match = true;
    for (int i = 0; i < size; i++)
    {
        if (gpu_result[i] != cpu_result[i])
        {
            vectors_match = false;
            break;
        }
    }

    if (vectors_match)
    {
        const clock_t htod_ticks = mem_htod_end - mem_htod_start;
        const clock_t kernel_ticks = gpu_end - gpu_start;
        const clock_t dtoh_ticks = mem_dtoh_end - mem_dtoh_start;

        printf("After Multiplication resultant vectors of CPU and GPU are same \n");
        printf("CPU mul time : %4.6f \n",
            (double)(cpu_end - cpu_start) / CLOCKS_PER_SEC);
        printf("GPU kernel execution time mul time : %4.6f \n",
            (double)kernel_ticks / CLOCKS_PER_SEC);
        printf("Mem transfer host to device : %4.6f \n",
            (double)htod_ticks / CLOCKS_PER_SEC);
        printf("Mem transfer device to host : %4.6f \n",
            (double)dtoh_ticks / CLOCKS_PER_SEC);
        printf("Total GPU time : %4.6f \n",
            (double)(htod_ticks + kernel_ticks + dtoh_ticks) / CLOCKS_PER_SEC);
    }
    else
    {
        printf("Vectors are different \n");
    }

    // Cleanup runs on both the match and the mismatch path.
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");

    free(cpu_result);
    free(gpu_result);
    free(h_b);
    free(h_a);

    check_cuda(cudaDeviceReset(), "cudaDeviceReset");
    return vectors_match ? 0 : EXIT_FAILURE;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment