Skip to content

Instantly share code, notes, and snippets.

@linuxelf001
Created January 28, 2010 04:00
Show Gist options
  • Save linuxelf001/288423 to your computer and use it in GitHub Desktop.
Save linuxelf001/288423 to your computer and use it in GitHub Desktop.
####################################### CUDA VERSION ################################################
// Authors: Rakesh Ginjupalli, Felix Rohrer
// Date: 12/27/09
// We thank Dr. Gaurav Khanna for his support.
// Includes
#include <stdio.h>
#include <stdlib.h> // atoi, malloc/free, rand/RAND_MAX, exit
// Variables
// Host (h_*) and device (d_*) buffers: A and B are the inputs,
// C receives the element-wise sum.
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
int iNumElements = 10; // Length of float arrays to process (overridable via argv[1])
bool DEBUG = true;     // when true, dump the arrays before/after the computation
// Functions
void fillFloatArray(float* arr, int length);              // fill arr with random floats in [0, 1]
void printFloatArray(float* arr, char* name, int length); // labelled debug dump to stdout
// Device code
// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
// Launched with a 1-D grid; each thread handles at most one element.
// The bounds check makes a grid rounded up past N safe.
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
        return;
    C[idx] = A[idx] + B[idx];
}
// Host code
// Abort with a readable message when a CUDA runtime call fails; the
// original ignored every return code, so a failed allocation or copy
// would silently produce garbage results.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver.
// Usage: prog [numElements [anything]]
//   argv[1] (optional): number of floats to add (default 10)
//   argv[2] (optional): if present, suppress the debug array dumps
int main(int argc, char **argv)
{
    // set amount of numbers to be calculated
    if (argc > 1) {
        iNumElements = atoi(argv[1]);
        printf("Setting numbers to %d\n", iNumElements);
        if (argc == 3) {
            DEBUG = false;
        }
    }
    size_t size = iNumElements * sizeof(float);

    // Allocate input/output vectors in host memory; abort on exhaustion
    // rather than passing NULL into fillFloatArray.
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        exit(EXIT_FAILURE);
    }

    // Initialize input vectors with pseudo-random values in [0, 1]
    fillFloatArray(h_A, iNumElements);
    fillFloatArray(h_B, iNumElements);
    if (DEBUG) {
        printFloatArray(h_A, "Array A", iNumElements);
        printFloatArray(h_B, "Array B", iNumElements);
    }

    // Allocate vectors in device memory
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc d_C");

    // Copy input vectors from host memory to device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Invoke kernel: one thread per element, grid rounded up (ceil-div)
    // so the in-kernel bounds check handles the partial last block.
    int threadsPerBlock = 256;
    int blocksPerGrid = (iNumElements + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, iNumElements);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // asynchronous kernel faults

    // Copy result from device memory to host memory; h_C then holds the sums
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");
    if (DEBUG) {
        printFloatArray(h_C, "Result", iNumElements);
    }

    // Free device memory
    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_C) cudaFree(d_C);
    // Free host memory
    if (h_A) free(h_A);
    if (h_B) free(h_B);
    if (h_C) free(h_C);
    return 0;
}
// Populate arr[0..length-1] with pseudo-random floats in [0, 1],
// using the C library rand() scaled by RAND_MAX.
void fillFloatArray(float* arr, int length){
    for (int idx = 0; idx < length; ++idx) {
        arr[idx] = (float)rand() / (float)RAND_MAX;
    }
}
// Debug helper: print `length` entries of arr under a heading, each
// with one decimal place, followed by a blank line.
void printFloatArray(float* arr, char* name, int length){
    printf("%s:\n", name);
    int idx = 0;
    while (idx < length) {
        printf("%.1f ", arr[idx]);
        ++idx;
    }
    printf("\n\n");
}
######################################### OPENCL VERSION ##################################################
// Authors: Rakesh Ginjupalli, Felix Rohrer
// Date: 12/27/09
// We thank Dr. Gaurav Khanna for his support.
// common SDK header for standard utilities and system libs
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
// Host buffers for demo
// *********************************************************************
float *srcA, *srcB, *dst; // Host buffers for OpenCL test (srcA/srcB inputs; dst = srcA + srcB)
bool DEBUG = true;        // when true, dump the arrays before/after the computation
// OpenCL Vars
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
int iNumElements = 10; // Length of float arrays to process (overridable via argv[1])
// Forward Declarations
// *********************************************************************
void fillFloatArray(float* arr, int length);              // fill arr with random floats in [0, 1]
void printFloatArray(float* arr, char* name, int length); // labelled debug dump to stdout
// Main function
// *********************************************************************
// Host driver (OpenCL).
// Usage: prog [numElements [anything]]
//   argv[1] (optional): number of floats to add (default 10)
//   argv[2] (optional): if present, suppress the debug array dumps
int main(int argc, char **argv)
{
    // set amount of numbers to be calculated
    if (argc > 1) {
        iNumElements = atoi(argv[1]);
        printf("Setting numbers to %d\n", iNumElements);
        if (argc == 3) {
            DEBUG = false;
        }
    }

    // Global work size = iNumElements rounded up to the nearest multiple of
    // the local work size. Integer ceiling-division avoids the float
    // round-off the original multiplier computation risks for large N.
    size_t szLocalWorkSize = 256;
    size_t szGlobalWorkSize = ((iNumElements + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

    // Allocate and initialize host arrays. calloc zero-fills the padding
    // tail (elements iNumElements..szGlobalWorkSize-1), so the full-buffer
    // device copies below never transfer uninitialized memory.
    srcA = (float *)calloc(szGlobalWorkSize, sizeof(float));
    srcB = (float *)calloc(szGlobalWorkSize, sizeof(float));
    dst  = (float *)calloc(szGlobalWorkSize, sizeof(float));
    if (srcA == NULL || srcB == NULL || dst == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    fillFloatArray(srcA, iNumElements);
    fillFloatArray(srcB, iNumElements);
    if (DEBUG) {
        printFloatArray(srcA, "Field A", iNumElements);
        printFloatArray(srcB, "Field B", iNumElements);
    }

    // Create the OpenCL context on a GPU device
    cl_context cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

    // Get the list of GPU devices associated with the context
    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    cl_device_id* cdDevices = (cl_device_id*)malloc(szParmDataBytes);
    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);

    // Create a command-queue on the first device
    cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, 0);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    cl_mem cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, NULL);
    cl_mem cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, NULL);
    cl_mem cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, NULL);

    // Kernel source: guarded element-wise vector addition c = a + b
    char* cSourceCL = " __kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements) "
    " { "
    " int iGID = get_global_id(0); "
    " if (iGID < iNumElements) "
    " c[iGID] = a[iGID] + b[iGID]; "
    " } ";

    // Create and build the program. A NULL lengths array tells OpenCL the
    // source string is null-terminated; the original passed a pointer to
    // the never-assigned szKernelLength global, which only worked because
    // file-scope size_t variables are zero-initialized.
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, NULL, NULL);
    clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

    // Create the kernel
    cl_kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", NULL);

    // Set the argument values. clSetKernelArg takes a const void*; the
    // original's (float*) casts were misleading (though harmless).
    clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);

    // --------------------------------------------------------
    // Core sequence: copy input data to GPU, compute, copy results back.
    // Asynchronous writes of the input data to the GPU device
    clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL);
    clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL);
    // Launch kernel
    clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    // Blocking read of the results (CL_TRUE waits for the queued work)
    clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    if (DEBUG) {
        printFloatArray(dst, "Result", iNumElements);
    }

    // Cleanup: release objects that depend on the context first and the
    // context last. The original released the context before its buffers
    // and queue, which is the wrong teardown order.
    if (ckKernel) clReleaseKernel(ckKernel);
    if (cpProgram) clReleaseProgram(cpProgram);
    if (cmDevSrcA) clReleaseMemObject(cmDevSrcA);
    if (cmDevSrcB) clReleaseMemObject(cmDevSrcB);
    if (cmDevDst) clReleaseMemObject(cmDevDst);
    if (cqCommandQueue) clReleaseCommandQueue(cqCommandQueue);
    if (cxGPUContext) clReleaseContext(cxGPUContext);
    if (cdDevices) free(cdDevices);

    // Free host memory
    free(srcA);
    free(srcB);
    free(dst);
    return 0;
}
// Fill the first `length` entries of arr with pseudo-random floats
// in [0, 1], drawn from the C library rand() generator.
void fillFloatArray(float* arr, int length){
    int idx;
    for (idx = 0; idx < length; idx++)
        arr[idx] = (float)rand() / (float)RAND_MAX;
}
// Debug helper: write `length` entries of arr to stdout under a heading,
// one decimal place per value, then a blank line.
void printFloatArray(float* arr, char* name, int length){
    printf("%s:\n", name);
    for (int idx = 0; idx < length; ++idx)
        printf("%.1f ", arr[idx]);
    printf("\n\n");
}
############################################### PYTHON CODE FOR COMPARISON ###############################
import os, subprocess
from timeit import Timer
def callCUDA():
    """Run the CUDA vector-addition executable once, blocking until it exits.

    Change "./cuda" to the name of the executable built from the CUDA
    version of the algorithm.
    """
    # subprocess.call already defaults bufsize/executable/stdin/stdout/stderr,
    # so the original's five extra positional arguments were redundant.
    subprocess.call("./cuda")
def callOpenCL():
    """Run the OpenCL vector-addition executable once, blocking until it exits.

    Change "./qd" to the name of the executable built from the OpenCL
    version of the algorithm.
    """
    # subprocess.call already defaults bufsize/executable/stdin/stdout/stderr,
    # so the original's five extra positional arguments were redundant.
    subprocess.call("./qd")
if __name__ == '__main__':
    # Time each implementation as the best of 5 single runs (min suppresses
    # scheduler noise) and record the results in 'performanceResult'.
    # The original labels were wrong: this script compares CUDA against
    # OpenCL, yet wrote "CPU/CUDA comparison" (with a typo) and labelled
    # the OpenCL timing "CPU". The `with` block also guarantees the file
    # is closed even if a timing run raises.
    with open('performanceResult', 'w') as f:
        f.write('Floating point vector addition - CUDA/OpenCL comparison\n')
        best = min(Timer("callCUDA()", "from __main__ import callCUDA").repeat(5, 1))
        f.write('Time needed using CUDA:\n' + str(best) + '\n')
        f.write('\n\nFloating point vector addition using OpenCL\n')
        best = min(Timer("callOpenCL()", "from __main__ import callOpenCL").repeat(5, 1))
        f.write('Time needed using OpenCL:\n' + str(best) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment