Skip to content

Instantly share code, notes, and snippets.

@mikebsg01
Last active March 9, 2020 20:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikebsg01/586a74be51599c5851cadf4fa16801d8 to your computer and use it in GitHub Desktop.
Save mikebsg01/586a74be51599c5851cadf4fa16801d8 to your computer and use it in GitHub Desktop.
Vector Sum in CUDA C++ - By: Michael Serrato
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <ctime>
#include <cstdlib>
#include <cmath>
using namespace std;
/**
* @author Michael Serrato
*/
/**
 * Maps a device's compute-capability revision to the name of its GPU
 * architecture.
 *
 * @param devProp Device properties, as filled in by cudaGetDeviceProperties().
 * @return The architecture name, or "Unknown device type" when the major
 *         revision is not covered by this table.
 */
string getDeviceType(cudaDeviceProp& devProp) {
    switch (devProp.major) {
    case 2:
        return "Fermi";
    case 3:
        return "Kepler";
    case 5:
        return "Maxwell";
    case 6:
        return "Pascal";
    case 7:
        // Major revision 7 is shared: 7.0 / 7.2 are Volta, 7.5 is Turing.
        return devProp.minor >= 5 ? "Turing" : "Volta";
    case 8:
        // Major revision 8 is shared: 8.9 is Ada Lovelace, the rest Ampere.
        return devProp.minor == 9 ? "Ada Lovelace" : "Ampere";
    case 9:
        return "Hopper";
    default:
        return "Unknown device type";
    }
}
/**
 * Estimates the total number of CUDA cores (streaming processors) on a device
 * as multiProcessorCount * cores-per-multiprocessor, where the per-SM figure
 * is looked up from the compute capability.
 *
 * @param devProp Device properties, as filled in by cudaGetDeviceProperties().
 * @return Total core count, or 0 (after printing "Unknown device type" to
 *         stdout) when the major revision is not covered by this table.
 */
int getSPCores(cudaDeviceProp& devProp) {
    const int sm = devProp.multiProcessorCount;
    int sp = 0; // cores per multiprocessor; stays 0 for unknown devices

    switch (devProp.major) {
    case 2: /* Fermi: 2.1 has 48 cores/SM, 2.0 has 32 */
        sp = devProp.minor == 1 ? 48 : 32;
        break;
    case 3: /* Kepler */
        sp = 192;
        break;
    case 5: /* Maxwell */
        sp = 128;
        break;
    case 6: /* Pascal: only 6.0 has 64 cores/SM; 6.1 and 6.2 have 128 */
        sp = devProp.minor == 0 ? 64 : 128;
        break;
    case 7: /* Volta (7.0, 7.2) and Turing (7.5) */
        sp = 64;
        break;
    case 8: /* Ampere / Ada: only 8.0 has 64 cores/SM; 8.6, 8.7, 8.9 have 128 */
        sp = devProp.minor == 0 ? 64 : 128;
        break;
    case 9: /* Hopper */
        sp = 128;
        break;
    default:
        cout << "Unknown device type" << endl;
        break;
    }

    return sm * sp;
}
/**
 * Converts a byte count to whole kibibytes (1024 bytes), rounding down.
 */
size_t getMemInKb(size_t memory) {
    return memory >> 10;
}

/**
 * Converts a byte count to whole mebibytes (1024 * 1024 bytes), rounding down.
 */
size_t getMemInMb(size_t memory) {
    return memory >> 20;
}

/**
 * Converts a byte count to whole gibibytes (1024^3 bytes), rounding down.
 */
size_t getMemInGb(size_t memory) {
    return memory >> 30;
}
/**
 * Prints a human-readable report (in Spanish) of a CUDA device's properties
 * to stdout.
 *
 * @param devProp Device properties, as filled in by cudaGetDeviceProperties().
 * @param device  Device ordinal; only echoed in the report header.
 */
void printDevProp(cudaDeviceProp& devProp, int device) {
    cout << "Dispositivo: " << device << endl
         << "Numero de revision mayoritario: " << devProp.major << endl
         << "Numero de revision minoritario: " << devProp.minor << endl
         << "Nombre: " << devProp.name << endl
         << "Arquitectura: " << getDeviceType(devProp) << endl
         << "Numero de multiprocesadores: " << devProp.multiProcessorCount << endl
         << "Cores CUDA: " << getSPCores(devProp) << endl
         << "Total de memoria global: " << getMemInGb(devProp.totalGlobalMem) << " GB" << endl
         << "Total de memoria compartida por bloque: " << getMemInKb(devProp.sharedMemPerBlock) << " KB" << endl
         << "Total de registros por bloque: " << devProp.regsPerBlock << endl
         << "Tamano del warp: " << devProp.warpSize << endl
         // NOTE(review): the "+ 1" presumably rounds a pitch reported as
         // (2^k - 1) bytes up to a whole number of GB -- confirm.
         << "Pitch maximo de memoria: " << getMemInGb(devProp.memPitch + 1) << " GB" << endl
         << "Hilos maximos por bloque: " << devProp.maxThreadsPerBlock << endl << endl;

    // 'X' + i yields the axis letters X, Y, Z for i = 0, 1, 2.
    for (int i = 0; i < 3; ++i) {
        cout << "Dimension maxima " << ((char)('X' + i)) << " de bloque: " << devProp.maxThreadsDim[i] << endl;
    }
    cout << endl;

    for (int i = 0; i < 3; ++i) {
        cout << "Dimension maxima " << ((char)('X' + i)) << " de grid: " << devProp.maxGridSize[i] << endl;
    }
    cout << endl;

    // Both clock rates are reported by the runtime in kilohertz, so both
    // lines carry the same unit suffix.
    cout << "Velocidad de reloj: " << devProp.clockRate << " Khz" << endl
         << "Frecuencia pico del reloj de memoria: " << devProp.memoryClockRate << " Khz" << endl
         << "Memoria constante total: " << devProp.totalConstMem << endl
         << "Alineamiento de textura: " << devProp.textureAlignment << endl
         << "Copiado y ejecucion concurrente: " << (devProp.deviceOverlap ? "Si" : "No") << endl
         << "Timeout de ejecucion del Kernel: " << (devProp.kernelExecTimeoutEnabled ? "Si" : "No") << endl;
}
// Number of elements in each vector. This is a preprocessor macro, so every
// later occurrence of the token `length` in this file is replaced by the
// literal below.
#define length 100000015
// Work-per-thread factor: main() divides `length` by
// (threads per block * ELEMENTS_PER_THREAD) to size the launch grid.
#define ELEMENTS_PER_THREAD 1000
// Alternative small vector size, currently disabled.
// #define length 25
// Scale factor used by fillWithRandoms(): generated values lie in
// [0, maxRandomNum].
const int maxRandomNum = 10;
/**
 * Prints the first `size` elements of an array to stdout on a single line,
 * separated by single spaces and terminated by a newline.
 *
 * @param ptr  Pointer to the first element; must reference at least `size`
 *             readable elements.
 * @param size Number of elements to print; 0 prints just the newline.
 */
template <typename T>
void printArray(T* ptr, size_t size) {
    // Bounded by the `size` argument, not the global `length` macro, so that
    // arrays shorter than `length` are not read past their end.
    for (size_t i = 0; i < size; ++i) {
        std::cout << (i > 0 ? " " : "") << ptr[i];
    }
    std::cout << std::endl;
}
/**
 * Overwrites `size` elements starting at `ptr` with pseudo-random integers.
 *
 * Each value is rand() scaled into [0, maxRandomNum] and rounded to the
 * nearest integer before being stored as T. The sequence depends on the
 * current rand() seed.
 *
 * @param ptr  Destination buffer with room for at least `size` elements.
 * @param size Number of elements to write.
 */
template <typename T>
void fillWithRandoms(T* ptr, size_t size) {
    T* const stop = ptr + size;
    for (T* slot = ptr; slot != stop; ++slot) {
        // Fraction in [0, 1] derived from the next rand() draw.
        const double fraction = ((double)rand()) / RAND_MAX;
        // Rounded through an int first, then converted to T on assignment.
        const int value = round(fraction * maxRandomNum);
        *slot = value;
    }
}
/**
 * Host-side reference implementation of element-wise vector addition:
 * c[i] = a[i] + b[i] for i in [0, size), processed in ascending index order.
 *
 * @param size Number of elements to process.
 * @param a    First input vector.
 * @param b    Second input vector.
 * @param c    Output vector; receives the element-wise sums.
 */
template <typename T>
void vectorSumByCPU(size_t size, T* a, T* b, T* c) {
    T* lhs = a;
    T* rhs = b;
    T* out = c;
    for (size_t remaining = size; remaining > 0; --remaining) {
        *out = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++out;
    }
}
/**
 * GPU kernel for element-wise vector addition: c[i] = a[i] + b[i] for every
 * i in [0, length), where `length` is the file-level element-count macro.
 *
 * Launch layout: 1-D grid of 1-D blocks. Any grid and block size produces a
 * complete, correct result, because each thread starts at its global index
 * and advances by the total number of launched threads until it passes
 * `length`. Consecutive threads therefore touch consecutive elements on each
 * iteration.
 * Shared memory: none.
 *
 * Every access is individually bounds-checked by the loop condition, so no
 * thread reads or writes past element length - 1 even when `length` is not a
 * multiple of the work assigned per thread. Indices are computed in size_t to
 * avoid 32-bit overflow on large launches.
 *
 * @param a Device pointer to the first input vector (at least `length` elements).
 * @param b Device pointer to the second input vector (at least `length` elements).
 * @param c Device pointer to the output vector (at least `length` elements).
 */
template <typename T>
__global__ void vectorSumByGPU(T* a, T* b, T* c) {
    const size_t total = length;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < total; i += stride) {
        c[i] = a[i] + b[i];
    }
}
/**
 * Counts the positions at which two arrays differ.
 *
 * @param a    First array; must hold at least `size` elements.
 * @param b    Second array; must hold at least `size` elements.
 * @param size Number of leading elements to compare. Zero or a negative
 *             value compares nothing.
 * @return Number of indices i in [0, size) with a[i] != b[i].
 */
template <typename T>
int compareArrays(T* a, T* b, int size) {
    int diff = 0;
    // The index has the same signed type as `size`: comparing against an
    // unsigned index would turn a negative size into a huge loop bound.
    for (int i = 0; i < size; ++i) {
        if (a[i] != b[i]) {
            ++diff;
        }
    }
    return diff;
}
/**
 * Terminates the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param status Return code of the CUDA runtime call.
 * @param what   Short description of the call, included in the message.
 */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Benchmark driver: fills two vectors of `length` random ints, adds them on
 * the CPU and on the GPU, prints the time each took, and reports how many
 * elements differ between the two results.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA
 *         runtime call fails.
 */
int main() {
    const size_t bytes = static_cast<size_t>(length) * sizeof(int);

    int* dev_a = NULL;
    int* dev_b = NULL;
    int* dev_c = NULL;
    int* a = new int[length];
    int* b = new int[length];
    int* cpu_c = new int[length];
    int* gpu_c = new int[length];
    clock_t start_time, end_time, duration_time;

    cudaDeviceProp devProp;
    checkCuda(cudaGetDeviceProperties(&devProp, 0), "cudaGetDeviceProperties");
    int maxThreads = devProp.maxThreadsPerBlock;

    srand((unsigned)time(NULL));
    fillWithRandoms(a, length);
    fillWithRandoms(b, length);

    cout << "Suma de vectores con " << length << " elementos:" << endl;

    cout << "Operacion en CPU:" << endl;
    start_time = clock();
    vectorSumByCPU(length, a, b, cpu_c);
    end_time = clock();
    duration_time = end_time - start_time;
    cout << "Toma " << ((((double)duration_time) / CLOCKS_PER_SEC) * 1000) << " ms" << endl;

    cout << "Operacion en GPU: " << endl;
    // Ceiling division: enough blocks that blocks * threads *
    // ELEMENTS_PER_THREAD covers every element.
    const int elementsPerBlock = maxThreads * ELEMENTS_PER_THREAD;
    int numBlocks = (length + elementsPerBlock - 1) / elementsPerBlock;
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(maxThreads);
    cout << "Numero de bloques utilizados: " << numBlocks << endl;
    cout << "Numero de hilos por bloque: " << maxThreads << endl;

    checkCuda(cudaMalloc(&dev_a, bytes), "cudaMalloc dev_a");
    checkCuda(cudaMalloc(&dev_b, bytes), "cudaMalloc dev_b");
    checkCuda(cudaMalloc(&dev_c, bytes), "cudaMalloc dev_c");
    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy a -> dev_a");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy b -> dev_b");

    start_time = clock();
    vectorSumByGPU<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    // A launch reports configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorSumByGPU launch");
    // Kernel launches are asynchronous: wait for completion before stopping
    // the clock, otherwise only the launch overhead is measured.
    checkCuda(cudaDeviceSynchronize(), "vectorSumByGPU execution");
    end_time = clock();
    duration_time = end_time - start_time;

    checkCuda(cudaMemcpy(gpu_c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy dev_c -> gpu_c");
    cout << "Toma " << ((((double)duration_time) / CLOCKS_PER_SEC) * 1000) << " ms" << endl;

    cout << "Errores de calculo: " << compareArrays(cpu_c, gpu_c, length) << endl;

    // Release device and host buffers.
    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");
    delete[] a;
    delete[] b;
    delete[] cpu_c;
    delete[] gpu_c;

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment