Created
January 28, 2010 04:00
-
-
Save linuxelf001/288423 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################### CUDA VERSION ################################################ | |
// Authors: Rakesh Ginjupalli, Felix Rohrer
// Date: 12/27/09
// We thank Dr. Gaurav Khanna for his support.
// Includes
#include <stdio.h>
#include <stdlib.h>  // malloc/free, atoi, rand/RAND_MAX, exit
// Variables | |
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C; | |
int iNumElements = 10; // Length of float arrays to process | |
bool DEBUG = true; | |
// Functions | |
void fillFloatArray(float* arr, int length); | |
void printFloatArray(float* arr, char* name, int length); | |
// Device code | |
// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
// Expects a 1-D launch with at least N total threads; threads whose
// global index falls past the end of the arrays return early.
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
        return;
    C[idx] = A[idx] + B[idx];
}
// Print the CUDA error and abort if a runtime call failed.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host code: vector addition h_C = h_A + h_B computed on the GPU.
// Usage: ./prog [numElements [quiet]] — a third argument of any value
// disables the debug printing of the arrays.
int main(int argc, char **argv)
{
    // Optional CLI: argv[1] = element count, argv[2] (presence only) = quiet.
    if (argc > 1) {
        iNumElements = atoi(argv[1]);
        printf("Setting numbers to %d\n", iNumElements);
        if (argc == 3) {
            DEBUG = false;
        }
    }
    size_t size = iNumElements * sizeof(float);

    // Allocate input/output vectors in host memory.
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Initialize input vectors with pseudo-random values.
    fillFloatArray(h_A, iNumElements);
    fillFloatArray(h_B, iNumElements);
    if (DEBUG) {
        printFloatArray(h_A, "Array A", iNumElements);
        printFloatArray(h_B, "Array B", iNumElements);
    }

    // Allocate vectors in device memory.
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc d_C");

    // Copy the inputs from host memory to device memory.
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Launch one thread per element, rounding the grid up (ceil-div);
    // the kernel bounds-checks the tail threads of the last block.
    int threadsPerBlock = 256;
    int blocksPerGrid = (iNumElements + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, iNumElements);
    checkCuda(cudaGetLastError(), "kernel launch");

    // The blocking device-to-host copy also synchronizes with the kernel,
    // so h_C holds the finished result afterwards.
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy result to host");
    if (DEBUG) {
        printFloatArray(h_C, "Result", iNumElements);
    }

    // Free device memory.
    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_C) cudaFree(d_C);

    // Free host memory.
    if (h_A) free(h_A);
    if (h_B) free(h_B);
    if (h_C) free(h_C);
    return 0;
}
/* Populate the first `length` slots of `arr` with pseudo-random
 * values in [0, 1] (rand() scaled down by RAND_MAX). */
void fillFloatArray(float* arr, int length){
    float scale = (float)RAND_MAX;
    int idx = 0;
    while (idx < length) {
        arr[idx] = rand() / scale;
        idx++;
    }
}
/* Print `name` as a heading, then the first `length` entries of `arr`
 * with one decimal place each, space-separated, ending in a blank line. */
void printFloatArray(float* arr, char* name, int length){
    printf("%s:\n", name);
    int idx;
    for (idx = 0; idx < length; idx++)
        printf("%.1f ", arr[idx]);
    printf("\n\n");
}
######################################### OPENCL VERSION ################################################## | |
// Authors: Rakesh Ginjupalli, Felix Rohrer
// Date: 12/27/09
// We thank Dr. Gaurav Khanna for his support.
// common SDK header for standard utilities and system libs | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <CL/cl.h> | |
// Host buffers for demo | |
// ********************************************************************* | |
float *srcA, *srcB, *dst; // Host buffers for OpenCL test | |
bool DEBUG = true; | |
// OpenCL Vars | |
size_t szParmDataBytes; // Byte size of context information | |
size_t szKernelLength; // Byte size of kernel code | |
int iNumElements = 10; // Length of float arrays to process | |
// Forward Declarations | |
// ********************************************************************* | |
void fillFloatArray(float* arr, int length); | |
void printFloatArray(float* arr, char* name, int length); | |
// Main function | |
// ********************************************************************* | |
int main(int argc, char **argv) | |
{ | |
// set amount of numbers to be calculated | |
if(argc > 1){ | |
iNumElements = atoi(argv[1]); | |
printf("Setting numbers to %d\n", iNumElements); | |
if(argc == 3){ | |
DEBUG = false; | |
} | |
} | |
// set and log Global and Local work size dimensions | |
size_t szLocalWorkSize = 256; | |
float multiplier = iNumElements/(float)szLocalWorkSize; | |
if(multiplier > (int)multiplier){ | |
multiplier += 1; | |
} | |
size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; // rounded up to the nearest multiple of the LocalWorkSize | |
// Allocate and initialize host arrays | |
srcA = (float *)malloc(sizeof(float) * szGlobalWorkSize); | |
srcB = (float *)malloc(sizeof(float) * szGlobalWorkSize); | |
dst = (float *)malloc(sizeof(float) * szGlobalWorkSize); | |
fillFloatArray(srcA, iNumElements); | |
fillFloatArray(srcB, iNumElements); | |
if(DEBUG){ | |
printFloatArray(srcA, "Field A", iNumElements); | |
printFloatArray(srcB, "Field B", iNumElements); | |
} | |
// Create the OpenCL context on a GPU device | |
cl_context cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL); | |
// Get the list of GPU devices associated with context | |
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); | |
cl_device_id* cdDevices = (cl_device_id*)malloc(szParmDataBytes); | |
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); | |
// Create a command-queue | |
cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, 0); | |
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM | |
cl_mem cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, NULL); | |
cl_mem cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, NULL); | |
cl_mem cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, NULL); | |
char* cSourceCL = " __kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements) " | |
" { " | |
" int iGID = get_global_id(0); " | |
" if (iGID < iNumElements) " | |
" c[iGID] = a[iGID] + b[iGID]; " | |
" } "; | |
// Create the program | |
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, NULL); | |
clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); | |
// Create the kernel | |
cl_kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", NULL); | |
// Set the Argument values | |
clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (float*)&cmDevSrcA); | |
clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (float*)&cmDevSrcB); | |
clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (float*)&cmDevDst); | |
clSetKernelArg(ckKernel, 3, sizeof(cl_int), (float*)&iNumElements); | |
// -------------------------------------------------------- | |
// Start Core sequence... copy input data to GPU, compute, copy results back | |
// Asynchronous write of data to GPU device | |
clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcA, 0, NULL, NULL); | |
clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize, srcB, 0, NULL, NULL); | |
// Launch kernel | |
clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); | |
// Synchronous/blocking read of results, and check accumulated errors | |
clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); | |
if(DEBUG){ | |
printFloatArray(dst, "Result", iNumElements); | |
} | |
// Cleanup allocated objects | |
if(cdDevices)free(cdDevices); | |
if(ckKernel)clReleaseKernel(ckKernel); | |
if(cpProgram)clReleaseProgram(cpProgram); | |
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue); | |
if(cxGPUContext)clReleaseContext(cxGPUContext); | |
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA); | |
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB); | |
if(cmDevDst)clReleaseMemObject(cmDevDst); | |
// Free host memory | |
free(srcA); | |
free(srcB); | |
free (dst); | |
} | |
/* Populate the first `length` slots of `arr` with pseudo-random
 * values in [0, 1] (rand() scaled down by RAND_MAX). */
void fillFloatArray(float* arr, int length){
    float scale = (float)RAND_MAX;
    int idx = 0;
    while (idx < length) {
        arr[idx] = rand() / scale;
        idx++;
    }
}
/* Print `name` as a heading, then the first `length` entries of `arr`
 * with one decimal place each, space-separated, ending in a blank line. */
void printFloatArray(float* arr, char* name, int length){
    printf("%s:\n", name);
    int idx;
    for (idx = 0; idx < length; idx++)
        printf("%.1f ", arr[idx]);
    printf("\n\n");
}
############################################### PYTHON CODE FOR COMPARISON ############################### | |
import os, subprocess | |
from timeit import Timer | |
def callCUDA():
    """Run the CUDA executable once, blocking until it exits.

    Change "./cuda" to the name of the executable that runs the
    algorithm using CUDA.
    """
    # The original passed bufsize/executable/stdin/stdout/stderr
    # positionally as 0/None/None/None/None — all defaults, and fragile
    # if subprocess.call's signature ever shifts. Spell the call simply.
    subprocess.call("./cuda")
def callOpenCL():
    """Run the OpenCL executable once, blocking until it exits.

    Change "./qd" to the name of the executable that runs the
    algorithm using OpenCL.
    """
    # The original passed bufsize/executable/stdin/stdout/stderr
    # positionally as 0/None/None/None/None — all defaults, and fragile
    # if subprocess.call's signature ever shifts. Spell the call simply.
    subprocess.call("./qd")
if __name__ == '__main__':
    # Time each executable: 5 repeats of one run each, keeping the minimum,
    # which filters out scheduling noise. Results go to a report file.
    # `with` guarantees the file is closed even if a timing run raises.
    with open('performanceResult', 'w') as f:
        # Fixed typo: "Mutliprecision" -> "Multiprecision".
        f.write('Multiprecision library - CPU/CUDA comparison\n')
        best = min(Timer("callCUDA()", "from __main__ import callCUDA").repeat(5, 1))
        f.write('Time needed using CUDA:\n' + str(best) + '\n')
        f.write('\n\nFloating point vector addition using OpenCL\n')
        best = min(Timer("callOpenCL()", "from __main__ import callOpenCL").repeat(5, 1))
        # This times callOpenCL, so label it OpenCL (was mislabeled "CPU").
        f.write('Time needed using OpenCL:\n' + str(best) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment