mikebsg01/VectorSum_by_MichaelSerrato.cu

## VectorSum_by_MichaelSerrato.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <ctime>
#include <cstdlib>
#include <cmath>
using namespace std;

/**
 * @author Michael Serrato
 */

/**
 * Maps a device's compute-capability version to the name of its GPU
 * architecture generation.
 *
 * @param devProp Device properties; only the `major` and `minor` fields are read.
 * @return The architecture name, or "Unknown device type" when the major
 *         revision is not one of the generations handled below.
 */
string getDeviceType(cudaDeviceProp& devProp) {
    switch (devProp.major) {
    case 2:
        return "Fermi";
    case 3:
        return "Kepler";
    case 5:
        return "Maxwell";
    case 6:
        return "Pascal";
    case 7:
        // Major revision 7 is shared: 7.0 / 7.2 are Volta, 7.5 is Turing.
        return devProp.minor >= 5 ? "Turing" : "Volta";
    case 8:
        // Major revision 8 is shared: 8.0 / 8.6 / 8.7 are Ampere, 8.9 is Ada Lovelace.
        return devProp.minor >= 9 ? "Ada Lovelace" : "Ampere";
    case 9:
        return "Hopper";
    default:
        return "Unknown device type";
    }
}

/**
 * Estimates the total number of CUDA cores of a device.
 *
 * The number of cores per multiprocessor is looked up from the compute
 * capability and multiplied by the device's multiprocessor count.
 *
 * @param devProp Device properties; `major`, `minor` and
 *                `multiProcessorCount` are read.
 * @return Total core count. For an unrecognised major revision the message
 *         "Unknown device type" is printed to stdout and 0 is returned.
 */
int getSPCores(cudaDeviceProp& devProp) {
    int coresPerSM = 0;

    switch (devProp.major) {
    case 2: /* Fermi: 48 cores per SM on 2.1, 32 otherwise */
        coresPerSM = devProp.minor == 1 ? 48 : 32;
        break;
    case 3: /* Kepler */
        coresPerSM = 192;
        break;
    case 5: /* Maxwell */
        coresPerSM = 128;
        break;
    case 6: /* Pascal: only 6.0 has 64 cores per SM; 6.1 and 6.2 have 128 */
        coresPerSM = devProp.minor == 0 ? 64 : 128;
        break;
    case 7: /* Volta and Turing */
        coresPerSM = 64;
        break;
    case 8: /* Ampere / Ada Lovelace: 64 on 8.0; 128 on 8.6, 8.7 and 8.9 */
        coresPerSM = devProp.minor == 0 ? 64 : 128;
        break;
    case 9: /* Hopper */
        coresPerSM = 128;
        break;
    default:
        cout << "Unknown device type" << endl;
        break;
    }

    return devProp.multiProcessorCount * coresPerSM;
}

/**
 * Converts a byte count to whole kilobytes (1024 bytes each).
 * The division is integral, so any remainder is discarded.
 */
size_t getMemInKb(size_t memory) {
    const size_t bytesPerKb = 1024;
    const size_t wholeKb = memory / bytesPerKb;
    return wholeKb;
}

/**
 * Converts a byte count to whole megabytes by first reducing it to
 * kilobytes and then dividing by 1024 again (remainders are discarded).
 */
size_t getMemInMb(size_t memory) {
    const size_t wholeKb = getMemInKb(memory);
    const size_t kbPerMb = 1024;
    return wholeKb / kbPerMb;
}

/**
 * Converts a byte count to whole gigabytes by first reducing it to
 * megabytes and then dividing by 1024 again (remainders are discarded).
 */
size_t getMemInGb(size_t memory) {
    const size_t wholeMb = getMemInMb(memory);
    const size_t mbPerGb = 1024;
    return wholeMb / mbPerGb;
}

/**
 * Writes a line-by-line report of a device's properties to standard output.
 *
 * @param devProp Properties structure describing the device.
 * @param device  Device number shown on the first line of the report.
 */
void printDevProp(cudaDeviceProp& devProp, int device) {
    // Identification and capacity figures, one property per line.
    cout << "Dispositivo: " << device << endl;
    cout << "Numero de revision mayoritario: " << devProp.major << endl;
    cout << "Numero de revision minoritario: " << devProp.minor << endl;
    cout << "Nombre: " << devProp.name << endl;
    cout << "Arquitectura: " << getDeviceType(devProp) << endl;
    cout << "Numero de multiprocesadores: " << devProp.multiProcessorCount << endl;
    cout << "Cores CUDA: " << getSPCores(devProp) << endl;
    cout << "Total de memoria global: " << getMemInGb(devProp.totalGlobalMem) << " GB" << endl;
    cout << "Total de memoria compartida por bloque: " << getMemInKb(devProp.sharedMemPerBlock) << " KB" << endl;
    cout << "Total de registros por bloque: " << devProp.regsPerBlock << endl;
    cout << "Tamano del warp: " << devProp.warpSize << endl;
    // One byte is added to memPitch before the (truncating) conversion to GB.
    cout << "Pitch maximo de memoria: " << getMemInGb(devProp.memPitch + 1) << " GB" << endl;
    cout << "Hilos maximos por bloque: " << devProp.maxThreadsPerBlock << endl;
    cout << endl;

    // Per-axis limits: block dimensions first, then grid dimensions.
    const char axisNames[3] = { 'X', 'Y', 'Z' };

    for (int axis = 0; axis < 3; ++axis) {
        cout << "Dimension maxima " << axisNames[axis] << " de bloque: " << devProp.maxThreadsDim[axis] << endl;
    }
    cout << endl;

    for (int axis = 0; axis < 3; ++axis) {
        cout << "Dimension maxima " << axisNames[axis] << " de grid: " << devProp.maxGridSize[axis] << endl;
    }
    cout << endl;

    // Clocks and miscellaneous capabilities; boolean flags are shown as Si/No.
    const char* overlapAnswer = devProp.deviceOverlap ? "Si" : "No";
    const char* timeoutAnswer = devProp.kernelExecTimeoutEnabled ? "Si" : "No";

    cout << "Velocidad de reloj: " << devProp.clockRate << endl;
    cout << "Frecuancia pico del reloj de memoria: " << devProp.memoryClockRate << " Khz" << endl;
    cout << "Memoria constante total: " << devProp.totalConstMem << endl;
    cout << "Alineamiento de textura: " << devProp.textureAlignment << endl;
    cout << "Copiado y ejecucion concurrente: " << overlapAnswer << endl;
    cout << "Timeout de ejecucion del Kernel: " << timeoutAnswer << endl;
}

// Number of elements in each vector. 100000015 is not a multiple of
// ELEMENTS_PER_THREAD, so the elements do not divide evenly among GPU threads.
// NOTE(review): `length` is a very generic macro name; every later identifier
// spelled `length` in this file is replaced by the number.
#define length 100000015
// Target number of elements per GPU thread; main divides `length` by
// (threads per block * ELEMENTS_PER_THREAD) to decide how many blocks to launch.
#define ELEMENTS_PER_THREAD 1000
// Alternative small vector size (presumably for debugging with printArray):
// #define length 25
// Largest value fillWithRandoms can produce (values range from 0 to this, inclusive).
const int maxRandomNum = 10;

/**
 * Prints the first `size` elements of an array to standard output on a single
 * line, separated by single spaces and followed by a newline.
 *
 * @param ptr  Pointer to the first element; must reference at least `size` elements.
 * @param size Number of elements to print. With 0 only the newline is written.
 */
template <typename T>
void printArray(T* ptr, size_t size) {
    // Bound the loop by the caller-supplied size (not the global vector length)
    // so that arrays of any length are printed without reading past their end.
    for (size_t i = 0; i < size; ++i) {
        std::cout << (i > 0 ? " " : "") << ptr[i];
    }
    std::cout << std::endl;
}

/**
 * Fills an array with pseudo-random integers.
 *
 * Each element receives rand() scaled to the range [0, maxRandomNum] and
 * rounded to the nearest integer. Exactly one rand() call is made per element,
 * in index order.
 *
 * @param ptr  Pointer to the first element to overwrite.
 * @param size Number of elements to fill.
 */
template <typename T>
void fillWithRandoms(T* ptr, size_t size) {
    T* const stop = ptr + size;

    for (T* cursor = ptr; cursor != stop; ++cursor) {
        const double fraction = ((double)rand()) / RAND_MAX;
        const int rounded = round(fraction * maxRandomNum);
        *cursor = rounded;
    }
}

/**
 * Host reference implementation of the element-wise vector sum.
 *
 * Stores a[i] + b[i] into c[i] for every i in [0, size), in increasing index
 * order.
 *
 * @param size Number of elements to process.
 * @param a    First input array.
 * @param b    Second input array.
 * @param c    Output array.
 */
template <typename T>
void vectorSumByCPU(size_t size, T* a, T* b, T* c) {
    T* lhs = a;
    T* rhs = b;
    T* out = c;
    T* const outEnd = c + size;

    while (out != outEnd) {
        *out = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/**
 * CUDA kernel computing the element-wise sum c[i] = a[i] + b[i] for the first
 * `length` elements.
 *
 * Launch layout: only the x dimension of the grid and of the block is used.
 * Each thread starts at its global thread index and advances by the total
 * number of launched threads, so every element is processed exactly once for
 * any grid/block size, and consecutive threads touch consecutive elements.
 * No shared memory is used.
 *
 * @param a Device pointer to the first input (at least `length` elements).
 * @param b Device pointer to the second input (at least `length` elements).
 * @param c Device pointer to the output (at least `length` elements).
 */
template <typename T>
__global__ void vectorSumByGPU(T* a, T* b, T* c) {
    // size_t arithmetic keeps the index computation from overflowing 32 bits.
    const size_t total = length;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Every index is checked against the element count individually; checking
    // only a thread's first index would let the last threads run past the end
    // of the buffers whenever the count is not a multiple of the per-thread work.
    for (size_t i = first; i < total; i += stride) {
        c[i] = a[i] + b[i];
    }
}

/**
 * Counts the positions at which two arrays differ.
 *
 * @param a    First array.
 * @param b    Second array.
 * @param size Number of leading elements to compare; zero or a negative value
 *             compares nothing.
 * @return Number of indices i in [0, size) for which a[i] != b[i].
 */
template <typename T>
int compareArrays(T* a, T* b, int size) {
    int diff = 0;

    // The index is signed like `size`: comparing an unsigned index against a
    // negative size would convert the size to a huge bound and read out of range.
    for (int i = 0; i < size; ++i) {
        if (a[i] != b[i]) {
            ++diff;
        }
    }

    return diff;
}

/**
 * Terminates the program with a diagnostic when a CUDA runtime call failed.
 *
 * @param status Return code of the CUDA call.
 * @param what   Short description of the call, included in the message.
 */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Benchmark driver: fills two vectors with random integers, adds them on the
 * CPU and on the GPU, prints the time each version takes and the number of
 * positions where the two results disagree.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    int* dev_a = NULL,
        * dev_b = NULL,
        * dev_c = NULL,
        * a = new int[length],
        * b = new int[length],
        * cpu_c = new int[length],
        * gpu_c = new int[length];

    const size_t bytes = (size_t)length * sizeof(int);

    clock_t start_time, end_time, duration_time;

    cudaDeviceProp devProp;
    checkCuda(cudaGetDeviceProperties(&devProp, 0), "cudaGetDeviceProperties");
    int maxThreads = devProp.maxThreadsPerBlock;

    srand((unsigned)time(NULL));

    fillWithRandoms(a, length);
    fillWithRandoms(b, length);

    cout << "Suma de vectores con " << length << " elementos:" << endl;

    cout << "Operacion en CPU:" << endl;
    start_time = clock();
    vectorSumByCPU(length, a, b, cpu_c);
    end_time = clock();
    duration_time = end_time - start_time;

    /*
    printArray(a, length);
    printArray(b, length);
    printArray(cpu_c, length);
    */

    cout << "Toma " << ((((double)duration_time) / CLOCKS_PER_SEC) * 1000) << " ms" << endl;

    cout << "Operacion en GPU: " << endl;

    // Ceiling division: enough blocks that threads * ELEMENTS_PER_THREAD covers
    // every element, without an extra idle block when the sizes divide evenly.
    const int elementsPerBlock = maxThreads * ELEMENTS_PER_THREAD;
    int numBlocks = (length + elementsPerBlock - 1) / elementsPerBlock;

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(maxThreads);

    cout << "Numero de bloques utilizados: " << numBlocks << endl;
    cout << "Numero de hilos por bloque: " << maxThreads << endl;

    checkCuda(cudaMalloc(&dev_a, bytes), "cudaMalloc dev_a");
    checkCuda(cudaMalloc(&dev_b, bytes), "cudaMalloc dev_b");
    checkCuda(cudaMalloc(&dev_c, bytes), "cudaMalloc dev_c");

    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy a -> dev_a");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy b -> dev_b");

    start_time = clock();
    vectorSumByGPU<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    // A launch does not return an error itself; query it explicitly.
    checkCuda(cudaGetLastError(), "vectorSumByGPU launch");
    // Launches are asynchronous: wait for the kernel to finish so that the
    // measured interval covers its execution and not only the launch call.
    checkCuda(cudaDeviceSynchronize(), "vectorSumByGPU execution");
    end_time = clock();
    duration_time = end_time - start_time;

    checkCuda(cudaMemcpy(gpu_c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy dev_c -> gpu_c");

    cout << "Toma " << ((((double)duration_time) / CLOCKS_PER_SEC) * 1000) << " ms" << endl;

    /*
    printArray(a, length);
    printArray(b, length);
    printArray(gpu_c, length);
    */

    cout << "Errores de calculo: " << compareArrays(cpu_c, gpu_c, length) << endl;

    // Release device and host buffers.
    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");

    delete[] a;
    delete[] b;
    delete[] cpu_c;
    delete[] gpu_c;

    return 0;
}
	#include "cuda_runtime.h"
	#include "device_launch_parameters.h"
	#include <iostream>
	#include <ctime>
	#include <cstdlib>
	#include <cmath>
	using namespace std;

	/**
	* @author Michael Serrato
	*/

	/**
	 * Maps a device's compute-capability version to the name of its GPU
	 * architecture generation.
	 *
	 * @param devProp Device properties; only the `major` and `minor` fields are read.
	 * @return The architecture name, or "Unknown device type" when the major
	 *         revision is not one of the generations handled below.
	 */
	string getDeviceType(cudaDeviceProp& devProp) {
	    switch (devProp.major) {
	    case 2:
	        return "Fermi";
	    case 3:
	        return "Kepler";
	    case 5:
	        return "Maxwell";
	    case 6:
	        return "Pascal";
	    case 7:
	        // Major revision 7 is shared: 7.0 / 7.2 are Volta, 7.5 is Turing.
	        return devProp.minor >= 5 ? "Turing" : "Volta";
	    case 8:
	        // Major revision 8 is shared: 8.0 / 8.6 / 8.7 are Ampere, 8.9 is Ada Lovelace.
	        return devProp.minor >= 9 ? "Ada Lovelace" : "Ampere";
	    case 9:
	        return "Hopper";
	    default:
	        return "Unknown device type";
	    }
	}

	/**
	 * Estimates the total number of CUDA cores of a device.
	 *
	 * The number of cores per multiprocessor is looked up from the compute
	 * capability and multiplied by the device's multiprocessor count.
	 *
	 * @param devProp Device properties; `major`, `minor` and
	 *                `multiProcessorCount` are read.
	 * @return Total core count. For an unrecognised major revision the message
	 *         "Unknown device type" is printed to stdout and 0 is returned.
	 */
	int getSPCores(cudaDeviceProp& devProp) {
	    int coresPerSM = 0;

	    switch (devProp.major) {
	    case 2: /* Fermi: 48 cores per SM on 2.1, 32 otherwise */
	        coresPerSM = devProp.minor == 1 ? 48 : 32;
	        break;
	    case 3: /* Kepler */
	        coresPerSM = 192;
	        break;
	    case 5: /* Maxwell */
	        coresPerSM = 128;
	        break;
	    case 6: /* Pascal: only 6.0 has 64 cores per SM; 6.1 and 6.2 have 128 */
	        coresPerSM = devProp.minor == 0 ? 64 : 128;
	        break;
	    case 7: /* Volta and Turing */
	        coresPerSM = 64;
	        break;
	    case 8: /* Ampere / Ada Lovelace: 64 on 8.0; 128 on 8.6, 8.7 and 8.9 */
	        coresPerSM = devProp.minor == 0 ? 64 : 128;
	        break;
	    case 9: /* Hopper */
	        coresPerSM = 128;
	        break;
	    default:
	        cout << "Unknown device type" << endl;
	        break;
	    }

	    return devProp.multiProcessorCount * coresPerSM;
	}

	/**
	 * Converts a byte count to whole kilobytes (1024 bytes each).
	 * The division is integral, so any remainder is discarded.
	 */
	size_t getMemInKb(size_t memory) {
	    const size_t bytesPerKb = 1024;
	    const size_t wholeKb = memory / bytesPerKb;
	    return wholeKb;
	}

	/**
	 * Converts a byte count to whole megabytes by first reducing it to
	 * kilobytes and then dividing by 1024 again (remainders are discarded).
	 */
	size_t getMemInMb(size_t memory) {
	    const size_t wholeKb = getMemInKb(memory);
	    const size_t kbPerMb = 1024;
	    return wholeKb / kbPerMb;
	}

	/**
	 * Converts a byte count to whole gigabytes by first reducing it to
	 * megabytes and then dividing by 1024 again (remainders are discarded).
	 */
	size_t getMemInGb(size_t memory) {
	    const size_t wholeMb = getMemInMb(memory);
	    const size_t mbPerGb = 1024;
	    return wholeMb / mbPerGb;
	}

	/**
	 * Writes a line-by-line report of a device's properties to standard output.
	 *
	 * @param devProp Properties structure describing the device.
	 * @param device  Device number shown on the first line of the report.
	 */
	void printDevProp(cudaDeviceProp& devProp, int device) {
	    // Identification and capacity figures, one property per line.
	    cout << "Dispositivo: " << device << endl;
	    cout << "Numero de revision mayoritario: " << devProp.major << endl;
	    cout << "Numero de revision minoritario: " << devProp.minor << endl;
	    cout << "Nombre: " << devProp.name << endl;
	    cout << "Arquitectura: " << getDeviceType(devProp) << endl;
	    cout << "Numero de multiprocesadores: " << devProp.multiProcessorCount << endl;
	    cout << "Cores CUDA: " << getSPCores(devProp) << endl;
	    cout << "Total de memoria global: " << getMemInGb(devProp.totalGlobalMem) << " GB" << endl;
	    cout << "Total de memoria compartida por bloque: " << getMemInKb(devProp.sharedMemPerBlock) << " KB" << endl;
	    cout << "Total de registros por bloque: " << devProp.regsPerBlock << endl;
	    cout << "Tamano del warp: " << devProp.warpSize << endl;
	    // One byte is added to memPitch before the (truncating) conversion to GB.
	    cout << "Pitch maximo de memoria: " << getMemInGb(devProp.memPitch + 1) << " GB" << endl;
	    cout << "Hilos maximos por bloque: " << devProp.maxThreadsPerBlock << endl;
	    cout << endl;

	    // Per-axis limits: block dimensions first, then grid dimensions.
	    const char axisNames[3] = { 'X', 'Y', 'Z' };

	    for (int axis = 0; axis < 3; ++axis) {
	        cout << "Dimension maxima " << axisNames[axis] << " de bloque: " << devProp.maxThreadsDim[axis] << endl;
	    }
	    cout << endl;

	    for (int axis = 0; axis < 3; ++axis) {
	        cout << "Dimension maxima " << axisNames[axis] << " de grid: " << devProp.maxGridSize[axis] << endl;
	    }
	    cout << endl;

	    // Clocks and miscellaneous capabilities; boolean flags are shown as Si/No.
	    const char* overlapAnswer = devProp.deviceOverlap ? "Si" : "No";
	    const char* timeoutAnswer = devProp.kernelExecTimeoutEnabled ? "Si" : "No";

	    cout << "Velocidad de reloj: " << devProp.clockRate << endl;
	    cout << "Frecuancia pico del reloj de memoria: " << devProp.memoryClockRate << " Khz" << endl;
	    cout << "Memoria constante total: " << devProp.totalConstMem << endl;
	    cout << "Alineamiento de textura: " << devProp.textureAlignment << endl;
	    cout << "Copiado y ejecucion concurrente: " << overlapAnswer << endl;
	    cout << "Timeout de ejecucion del Kernel: " << timeoutAnswer << endl;
	}

	// Number of elements in each vector. 100000015 is not a multiple of
	// ELEMENTS_PER_THREAD, so the elements do not divide evenly among GPU threads.
	// NOTE(review): `length` is a very generic macro name; every later identifier
	// spelled `length` in this file is replaced by the number.
	#define length 100000015
	// Target number of elements per GPU thread; main divides `length` by
	// (threads per block * ELEMENTS_PER_THREAD) to decide how many blocks to launch.
	#define ELEMENTS_PER_THREAD 1000
	// Alternative small vector size (presumably for debugging with printArray):
	// #define length 25
	// Largest value fillWithRandoms can produce (values range from 0 to this, inclusive).
	const int maxRandomNum = 10;

	/**
	 * Prints the first `size` elements of an array to standard output on a single
	 * line, separated by single spaces and followed by a newline.
	 *
	 * @param ptr  Pointer to the first element; must reference at least `size` elements.
	 * @param size Number of elements to print. With 0 only the newline is written.
	 */
	template <typename T>
	void printArray(T* ptr, size_t size) {
	    // Bound the loop by the caller-supplied size (not the global vector length)
	    // so that arrays of any length are printed without reading past their end.
	    for (size_t i = 0; i < size; ++i) {
	        std::cout << (i > 0 ? " " : "") << ptr[i];
	    }
	    std::cout << std::endl;
	}

	/**
	 * Fills an array with pseudo-random integers.
	 *
	 * Each element receives rand() scaled to the range [0, maxRandomNum] and
	 * rounded to the nearest integer. Exactly one rand() call is made per element,
	 * in index order.
	 *
	 * @param ptr  Pointer to the first element to overwrite.
	 * @param size Number of elements to fill.
	 */
	template <typename T>
	void fillWithRandoms(T* ptr, size_t size) {
	    T* const stop = ptr + size;

	    for (T* cursor = ptr; cursor != stop; ++cursor) {
	        const double fraction = ((double)rand()) / RAND_MAX;
	        const int rounded = round(fraction * maxRandomNum);
	        *cursor = rounded;
	    }
	}

	/**
	 * Host reference implementation of the element-wise vector sum.
	 *
	 * Stores a[i] + b[i] into c[i] for every i in [0, size), in increasing index
	 * order.
	 *
	 * @param size Number of elements to process.
	 * @param a    First input array.
	 * @param b    Second input array.
	 * @param c    Output array.
	 */
	template <typename T>
	void vectorSumByCPU(size_t size, T* a, T* b, T* c) {
	    T* lhs = a;
	    T* rhs = b;
	    T* out = c;
	    T* const outEnd = c + size;

	    while (out != outEnd) {
	        *out = *lhs + *rhs;
	        ++lhs;
	        ++rhs;
	        ++out;
	    }
	}

	/**
	 * CUDA kernel computing the element-wise sum c[i] = a[i] + b[i] for the first
	 * `length` elements.
	 *
	 * Launch layout: only the x dimension of the grid and of the block is used.
	 * Each thread starts at its global thread index and advances by the total
	 * number of launched threads, so every element is processed exactly once for
	 * any grid/block size, and consecutive threads touch consecutive elements.
	 * No shared memory is used.
	 *
	 * @param a Device pointer to the first input (at least `length` elements).
	 * @param b Device pointer to the second input (at least `length` elements).
	 * @param c Device pointer to the output (at least `length` elements).
	 */
	template <typename T>
	__global__ void vectorSumByGPU(T* a, T* b, T* c) {
	    // size_t arithmetic keeps the index computation from overflowing 32 bits.
	    const size_t total = length;
	    const size_t stride = (size_t)gridDim.x * blockDim.x;
	    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	    // Every index is checked against the element count individually; checking
	    // only a thread's first index would let the last threads run past the end
	    // of the buffers whenever the count is not a multiple of the per-thread work.
	    for (size_t i = first; i < total; i += stride) {
	        c[i] = a[i] + b[i];
	    }
	}

	/**
	 * Counts the positions at which two arrays differ.
	 *
	 * @param a    First array.
	 * @param b    Second array.
	 * @param size Number of leading elements to compare; zero or a negative value
	 *             compares nothing.
	 * @return Number of indices i in [0, size) for which a[i] != b[i].
	 */
	template <typename T>
	int compareArrays(T* a, T* b, int size) {
	    int diff = 0;

	    // The index is signed like `size`: comparing an unsigned index against a
	    // negative size would convert the size to a huge bound and read out of range.
	    for (int i = 0; i < size; ++i) {
	        if (a[i] != b[i]) {
	            ++diff;
	        }
	    }

	    return diff;
	}

	/**
	 * Terminates the program with a diagnostic when a CUDA runtime call failed.
	 *
	 * @param status Return code of the CUDA call.
	 * @param what   Short description of the call, included in the message.
	 */
	static void checkCuda(cudaError_t status, const char* what) {
	    if (status != cudaSuccess) {
	        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
	        exit(EXIT_FAILURE);
	    }
	}

	/**
	 * Benchmark driver: fills two vectors with random integers, adds them on the
	 * CPU and on the GPU, prints the time each version takes and the number of
	 * positions where the two results disagree.
	 *
	 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA call fails.
	 */
	int main() {
	    int* dev_a = NULL,
	        * dev_b = NULL,
	        * dev_c = NULL,
	        * a = new int[length],
	        * b = new int[length],
	        * cpu_c = new int[length],
	        * gpu_c = new int[length];

	    const size_t bytes = (size_t)length * sizeof(int);

	    clock_t start_time, end_time, duration_time;

	    cudaDeviceProp devProp;
	    checkCuda(cudaGetDeviceProperties(&devProp, 0), "cudaGetDeviceProperties");
	    int maxThreads = devProp.maxThreadsPerBlock;

	    srand((unsigned)time(NULL));

	    fillWithRandoms(a, length);
	    fillWithRandoms(b, length);

	    cout << "Suma de vectores con " << length << " elementos:" << endl;

	    cout << "Operacion en CPU:" << endl;
	    start_time = clock();
	    vectorSumByCPU(length, a, b, cpu_c);
	    end_time = clock();
	    duration_time = end_time - start_time;

	    /*
	    printArray(a, length);
	    printArray(b, length);
	    printArray(cpu_c, length);
	    */

	    cout << "Toma " << ((((double)duration_time) / CLOCKS_PER_SEC) * 1000) << " ms" << endl;

	    cout << "Operacion en GPU: " << endl;

	    // Ceiling division: enough blocks that threads * ELEMENTS_PER_THREAD covers
	    // every element, without an extra idle block when the sizes divide evenly.
	    const int elementsPerBlock = maxThreads * ELEMENTS_PER_THREAD;
	    int numBlocks = (length + elementsPerBlock - 1) / elementsPerBlock;

	    dim3 dimGrid(numBlocks);
	    dim3 dimBlock(maxThreads);

	    cout << "Numero de bloques utilizados: " << numBlocks << endl;
	    cout << "Numero de hilos por bloque: " << maxThreads << endl;

	    checkCuda(cudaMalloc(&dev_a, bytes), "cudaMalloc dev_a");
	    checkCuda(cudaMalloc(&dev_b, bytes), "cudaMalloc dev_b");
	    checkCuda(cudaMalloc(&dev_c, bytes), "cudaMalloc dev_c");

	    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy a -> dev_a");
	    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy b -> dev_b");

	    start_time = clock();
	    vectorSumByGPU<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
	    // A launch does not return an error itself; query it explicitly.
	    checkCuda(cudaGetLastError(), "vectorSumByGPU launch");
	    // Launches are asynchronous: wait for the kernel to finish so that the
	    // measured interval covers its execution and not only the launch call.
	    checkCuda(cudaDeviceSynchronize(), "vectorSumByGPU execution");
	    end_time = clock();
	    duration_time = end_time - start_time;

	    checkCuda(cudaMemcpy(gpu_c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy dev_c -> gpu_c");

	    cout << "Toma " << ((((double)duration_time) / CLOCKS_PER_SEC) * 1000) << " ms" << endl;

	    /*
	    printArray(a, length);
	    printArray(b, length);
	    printArray(gpu_c, length);
	    */

	    cout << "Errores de calculo: " << compareArrays(cpu_c, gpu_c, length) << endl;

	    // Release device and host buffers.
	    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
	    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
	    checkCuda(cudaFree(dev_c), "cudaFree dev_c");

	    delete[] a;
	    delete[] b;
	    delete[] cpu_c;
	    delete[] gpu_c;

	    return 0;
	}