Last active
May 23, 2020 00:09
-
-
Save mikebsg01/2150a7d18e0268b743a769405e9f0233 to your computer and use it in GitHub Desktop.
Parallel programming - Exam #1 - By: Michael Serrato
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cmath>
#include <cstdio>   // printf / fprintf / scanf used in main(); previously only transitively included
#include <cstdlib>
#include <ctime>
#include <iostream>

using namespace std;
/**
 * @author Michael Serrato
 * @author Alejandro Madariaga Angeles
 * @author Alejandro Gimenez Isasi
 */
// Total number of elements in each vector.
#define LENGTH 62000
// M: number of chunks each block walks in the third strategy.
#define M 3
// N: number of consecutive elements handled per block (per chunk).
#define N 6
// Upper bound (inclusive) for the random values written by fillWithRandoms.
#define MAX_RANDOM_NUM 100
// Tolerance used when comparing CPU and GPU float results element-wise.
#define EPSILON float(0.0000001)
/**
 * Prints the first `size` elements of `ptr` on a single line, separated by
 * single spaces, followed by a newline.
 */
template <typename T>
void printArray(T* ptr, size_t size) {
    const char* separator = "";
    for (size_t idx = 0; idx != size; ++idx) {
        cout << separator << ptr[idx];
        separator = " ";
    }
    cout << endl;
}
/* Random Number Generator */
/**
 * Fills ptr[0..size-1] with pseudo-random values in [0, MAX_RANDOM_NUM]:
 * a uniform sample of [0, 1] is scaled by MAX_RANDOM_NUM and rounded.
 * Call srand() beforehand to seed the sequence.
 *
 * Fix: the original stored round()'s double result in an `int` temporary
 * regardless of the template type T; the value is now converted directly
 * to T, keeping the template meaningful for floating-point element types.
 */
template <typename T>
void fillWithRandoms(T* ptr, size_t size) {
    for (size_t i = 0; i < size; ++i) {
        const double sample = ((double)rand()) / RAND_MAX;  // uniform in [0, 1]
        ptr[i] = static_cast<T>(round(sample * MAX_RANDOM_NUM));
    }
}
/* Vector Substraction performed by the CPU. */
/**
 * Element-wise subtraction: c[i] = a[i] - b[i] for every i in [0, size).
 * Serves as the reference result for validating the GPU strategies.
 */
template <typename T>
void vectorSubstractionByCPU(T* a, T* b, T* c, size_t size) {
    size_t i = 0;
    while (i < size) {
        c[i] = a[i] - b[i];
        ++i;
    }
}
/* Vector Substraction performed by the GPU - First Strategy */
/**
 * Expects a launch of ceil(LENGTH / N) blocks with 1 thread per block.
 * Each block processes the N consecutive elements starting at
 * blockIdx.x * N; the idx < LENGTH guard covers the tail block when
 * LENGTH is not a multiple of N.
 */
template <typename T>
__global__ void vectorSubstractionByGPUFirstStrategy(T* a, T* b, T* c) {
    const int base = blockIdx.x * N;
    for (int offset = 0; offset < N; ++offset) {
        const int idx = base + offset;
        if (idx < LENGTH) {
            c[idx] = a[idx] - b[idx];
        }
    }
}
/* Vector Substraction performed by the GPU - Second Strategy */
/**
 * Expects a launch of ceil(LENGTH / N) blocks with 1 thread per block.
 * Each block handles N/2 consecutive elements in the lower half of the
 * vectors and the N/2 elements at the mirrored offset in the upper half;
 * the guards cover the tail of each half.
 */
template <typename T>
__global__ void vectorSubstractionByGPUSecondStrategy(T* a, T* b, T* c) {
    const int lowerBase = blockIdx.x * (N / 2);
    const int upperBase = lowerBase + (LENGTH / 2);
    for (int k = 0; k < N / 2; ++k) {
        const int lo = lowerBase + k;
        const int hi = upperBase + k;
        if (lo < (LENGTH / 2)) {
            c[lo] = a[lo] - b[lo];
        }
        if (hi < LENGTH) {
            c[hi] = a[hi] - b[hi];
        }
    }
}
/* Vector Substraction performed by the GPU - Third Strategy */
/**
 * Expects a launch of ceil(LENGTH / (M * N)) blocks with 1 thread per block.
 * Each block processes N consecutive elements in each of M chunks of the
 * vectors, starting at blockIdx.x * N and striding LENGTH / M between chunks.
 *
 * Fix: the original had no bounds check, so the last block wrote out of
 * bounds (with LENGTH=62000, M=3, N=6 the final index reached
 * 3444*6 + 2*(62000/3) + 5 = 62001 > LENGTH-1). Each access is now guarded
 * with idx < LENGTH.
 */
template <typename T>
__global__ void vectorSubstractionByGPUThirdStrategy(T* a, T* b, T* c) {
    int tid = blockIdx.x * N;
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            const int idx = tid + j;
            if (idx < LENGTH) {  // guard the tail of each chunk
                c[idx] = a[idx] - b[idx];
            }
        }
        tid += (LENGTH / M);
    }
}
/**
 * Accumulates the element-wise absolute difference between a and b over
 * `size` elements. Writes into *countDiff the number of positions whose
 * difference exceeds EPSILON, and returns the total accumulated difference.
 */
float compareArrays(float* a, float* b, size_t size, int* countDiff) {
    float totalDiff = 0;
    int mismatches = 0;
    for (size_t k = 0; k < size; ++k) {
        const float delta = fabsf(a[k] - b[k]);
        totalDiff += delta;
        if (delta > EPSILON) {
            ++mismatches;
        }
    }
    *countDiff = mismatches;
    return totalDiff;
}
/**
 * Entry point: subtracts two random float vectors of LENGTH elements on the
 * CPU, then on the GPU with three block/work-distribution strategies, timing
 * each run and comparing every GPU result against the CPU reference.
 *
 * Fixes over the original version:
 *  - cudaDeviceSynchronize() is called before stopping the clock, so the GPU
 *    timings measure kernel execution instead of only the asynchronous launch
 *    overhead.
 *  - The four host buffers allocated with new[] are released with delete[]
 *    (they previously leaked; only the device buffers were freed).
 */
int main() {
	float* dev_a,
		* dev_b,
		* dev_c,
		* a = new float[LENGTH],
		* b = new float[LENGTH],
		* cpu_c = new float[LENGTH],
		* gpu_c = new float[LENGTH];
	int maxThreads, numBlocks, numThreads;
	clock_t start_time, end_time, duration_time;
	cudaDeviceProp devProp;
	cudaError_t cudaStatus;
	float difference = 0;
	int countDiff = 0;

	cudaSetDevice(0);
	cudaGetDeviceProperties(&devProp, 0);
	maxThreads = devProp.maxThreadsPerBlock;
	cout << "Propiedades del GPU:\n"
		<< "\t- Hilos maximos por bloque: " << maxThreads << endl
		<< "=============================================" << endl << endl;

	// Allocate Memory in GPU
	cudaMalloc(&dev_a, LENGTH * sizeof(float));
	cudaMalloc(&dev_b, LENGTH * sizeof(float));
	cudaMalloc(&dev_c, LENGTH * sizeof(float));

	// Fill Arrays A & B with Random Numbers
	srand((unsigned)time(NULL));
	fillWithRandoms(a, LENGTH);
	fillWithRandoms(b, LENGTH);

	cout << endl << "Resta de vectores con " << LENGTH << " elementos:" << endl << endl;

	// CPU reference run (its output validates each GPU strategy below).
	cout << "\tOperacion en CPU:" << endl;
	start_time = clock();
	vectorSubstractionByCPU(a, b, cpu_c, LENGTH);
	end_time = clock();
	duration_time = end_time - start_time;
	printf("\n\t\t-> Toma %.6f ms\n\n", ((((double)duration_time) / CLOCKS_PER_SEC) * 1000.00));

	// RUNNING FIRST STRATEGY...
	cout << "\tOperacion en GPU - PRIMER ESTRATEGIA:" << endl;
	numBlocks = ceil(((double)LENGTH) / N);  // one block per N consecutive elements
	numThreads = 1;
	cout << "\t\t- Numero de bloques utilizados: " << numBlocks << endl;
	cout << "\t\t- Numero de hilos por bloque: " << numThreads << endl << endl;
	cudaMemcpy(dev_a, a, LENGTH * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b, b, LENGTH * sizeof(float), cudaMemcpyHostToDevice);
	start_time = clock();
	dim3 dimGrid1(numBlocks);
	dim3 dimBlock1(numThreads);
	vectorSubstractionByGPUFirstStrategy << <dimGrid1, dimBlock1 >> > (dev_a, dev_b, dev_c);
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "Kernel launch FAILED: %s\n",
			cudaGetErrorString(cudaStatus));
	}
	// Kernel launches are asynchronous: wait for completion so the timing
	// below measures execution, not just the launch call.
	cudaDeviceSynchronize();
	end_time = clock();
	duration_time = end_time - start_time;
	cudaMemcpy(gpu_c, dev_c, LENGTH * sizeof(float), cudaMemcpyDeviceToHost);
	printf("\n\t\t-> Toma %.6f ms\n\n", ((((double)duration_time) / CLOCKS_PER_SEC) * 1000.00));
	difference = compareArrays(gpu_c, cpu_c, LENGTH, &countDiff);
	printf("\t\tElementos diferentes %d (%.3f %%) Con valor de %.8f\n", countDiff, ((((float)countDiff) / LENGTH) * 100), difference);
	printf("\t======================================================================\n\n");

	// RUNNING SECOND STRATEGY...
	cout << "\tOperacion en GPU - SEGUNDA ESTRATEGIA:" << endl;
	numBlocks = ceil(((double)LENGTH) / N);  // each block covers N/2 elements in each half
	numThreads = 1;
	cout << "\t\t- Numero de bloques utilizados: " << numBlocks << endl;
	cout << "\t\t- Numero de hilos por bloque: " << numThreads << endl << endl;
	cudaMemcpy(dev_a, a, LENGTH * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b, b, LENGTH * sizeof(float), cudaMemcpyHostToDevice);
	start_time = clock();
	dim3 dimGrid2(numBlocks);
	dim3 dimBlock2(numThreads);
	vectorSubstractionByGPUSecondStrategy << <dimGrid2, dimBlock2 >> > (dev_a, dev_b, dev_c);
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "Kernel launch FAILED: %s\n",
			cudaGetErrorString(cudaStatus));
	}
	cudaDeviceSynchronize();  // see note on the first strategy
	end_time = clock();
	duration_time = end_time - start_time;
	cudaMemcpy(gpu_c, dev_c, LENGTH * sizeof(float), cudaMemcpyDeviceToHost);
	printf("\n\t\t-> Toma %.6f ms\n\n", ((((double)duration_time) / CLOCKS_PER_SEC) * 1000.00));
	difference = compareArrays(gpu_c, cpu_c, LENGTH, &countDiff);
	printf("\t\tElementos diferentes %d (%.3f %%) Con valor de %.8f\n", countDiff, ((((float)countDiff) / LENGTH) * 100), difference);
	printf("\t======================================================================\n\n");

	// RUNNING THIRD STRATEGY...
	cout << "\tOperacion en GPU - TERCERA ESTRATEGIA:" << endl;
	numBlocks = ceil(((double)LENGTH) / (M * N));  // each block covers N elements in each of M chunks
	numThreads = 1;
	cout << "\t\t- Numero de bloques utilizados: " << numBlocks << endl;
	cout << "\t\t- Numero de hilos por bloque: " << numThreads << endl << endl;
	cudaMemcpy(dev_a, a, LENGTH * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b, b, LENGTH * sizeof(float), cudaMemcpyHostToDevice);
	start_time = clock();
	dim3 dimGrid3(numBlocks);
	dim3 dimBlock3(numThreads);
	vectorSubstractionByGPUThirdStrategy << <dimGrid3, dimBlock3 >> > (dev_a, dev_b, dev_c);
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "Kernel launch FAILED: %s\n",
			cudaGetErrorString(cudaStatus));
	}
	cudaDeviceSynchronize();  // see note on the first strategy
	end_time = clock();
	duration_time = end_time - start_time;
	cudaMemcpy(gpu_c, dev_c, LENGTH * sizeof(float), cudaMemcpyDeviceToHost);
	printf("\n\t\t-> Toma %.6f ms\n\n", ((((double)duration_time) / CLOCKS_PER_SEC) * 1000.00));
	difference = compareArrays(gpu_c, cpu_c, LENGTH, &countDiff);
	printf("\t\tElementos diferentes %d (%.3f %%) Con valor de %.8f\n", countDiff, ((((float)countDiff) / LENGTH) * 100), difference);
	printf("\t======================================================================\n\n");

	// Free Memory in GPU
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);

	// Free Memory in CPU (the original leaked these new[] buffers).
	delete[] a;
	delete[] b;
	delete[] cpu_c;
	delete[] gpu_c;

	printf("\nPresione cualquier tecla para salir...");
	char key;
	scanf("%c", &key);
	return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment