Skip to content

Instantly share code, notes, and snippets.

@Foadsf
Last active May 13, 2018 18:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Foadsf/628a046040c302f507c81fd0568d8b34 to your computer and use it in GitHub Desktop.
Save Foadsf/628a046040c302f507c81fd0568d8b34 to your computer and use it in GitHub Desktop.
OpenCL: comparing the time required to add two arrays of integers on different devices
ArrSize CPU GPU1 GPU2
1024 0.0022650000 0.0043130000 0.0028170000
2048 0.0017180000 0.0042000000 0.0034180000
4096 0.0018160000 0.0034690000 0.0033670000
8192 0.0024080000 0.0040740000 0.0032710000
16384 0.0029050000 0.0042900000 0.0031160000
32768 0.0036250000 0.0040950000 0.0034520000
65536 0.0052330000 0.0036320000 0.0032050000
131072 0.0076970000 0.0039470000 0.0031800000
262144 0.0162720000 0.0044290000 0.0031160000
524288 0.0315100000 0.0042210000 0.0035350000
1048576 0.0655650000 0.0051560000 0.0044900000
2097152 0.1322130000 0.0066200000 0.0069630000
4194304 0.2562870000 0.0099040000 0.0084920000
8388608 0.5589450000 0.0205990000 0.0121820000
16777216 1.0985770000 0.0335420000 0.0163770000
33554432 2.0345780000 0.0963500000 0.0444100000
67108864 3.7661480000 0.1875540000 0.0827400000
134217728 8.7979440000 0.3874850000 0.1690280000
#include <stdio.h>
#include <stdlib.h>
#include <time.h> // time()
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
#define OPENCL_CHECK(val) check_opencl_call((val), __FILE__, __LINE__)
/*
 * Report a failed OpenCL call.
 *
 * val  - status code returned by (or through the errcode_ret of) an OpenCL call
 * file - source file of the call site (supplied by OPENCL_CHECK via __FILE__)
 * line - source line of the call site (supplied by OPENCL_CHECK via __LINE__)
 *
 * Nothing is printed when val is CL_SUCCESS. Otherwise the numeric status and
 * the call site are written to stderr, so that diagnostics do not end up mixed
 * into the comma-separated timings the program prints on stdout. The function
 * only reports; it never aborts the program.
 */
void check_opencl_call(cl_int val, const char *const file, int const line)
{
    if (val != CL_SUCCESS) {
        fprintf(stderr, "OpenCL error %d at %s:%d\n", (int)val, file, line);
    }
}
/*
 * Integer power by repeated squaring.
 *
 * base - the number to raise
 * exp  - the exponent; intended to be non-negative
 *
 * Returns base raised to exp (1 when exp is 0). The caller is responsible for
 * choosing arguments whose result fits in an int.
 */
int int_pow(int base, int exp)
{
    int result = 1;
    while (exp) {
        if (exp & 1) {
            result *= base;
        }
        exp /= 2;
        // Square only while exponent bits remain: squaring after the last bit
        // is never used and can overflow int (e.g. 65536 * 65536 for 2^19).
        if (exp) {
            base *= base;
        }
    }
    return result;
}
int main(void) {
// Create the two input vectors
int i,j,k,l;
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id* platform_id;
cl_uint ret_num_platforms;
cl_device_id* device_id;
cl_uint ret_num_devices;
// get all platforms
cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
OPENCL_CHECK(ret);
platform_id = (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);//allocates a block om hip memory of platformCount size
ret = clGetPlatformIDs(ret_num_platforms, platform_id, NULL);//put a list of OpenCL platforms in platforms
OPENCL_CHECK(ret);
for (i = 0; i < ret_num_platforms; i++) {
clock_t start, end;
double cpu_time_used;
// get all devices
ret = clGetDeviceIDs(platform_id[i], CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices);
OPENCL_CHECK(ret);
device_id = (cl_device_id*) malloc(sizeof(cl_device_id) * ret_num_devices);
ret = clGetDeviceIDs(platform_id[i], CL_DEVICE_TYPE_ALL, ret_num_devices, device_id, &ret_num_devices);
OPENCL_CHECK(ret);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, ret_num_devices, device_id, NULL, NULL, &ret);
OPENCL_CHECK(ret);
for (k = 0; k < 20; k++) {
const int LIST_SIZE = 1024*int_pow(2,k);
printf("%d,", LIST_SIZE);
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(l = 0; l < LIST_SIZE; l++) {
A[i] = rand();
B[i] = rand();
}
for (j = 0; j < ret_num_devices; j++) {
start = clock();
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id[j], 0, &ret);
OPENCL_CHECK(ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
OPENCL_CHECK(ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
OPENCL_CHECK(ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
OPENCL_CHECK(ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
OPENCL_CHECK(ret);
// Build the program
ret = clBuildProgram(program, 1, device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
OPENCL_CHECK(ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("%.10f,", cpu_time_used);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
free(C);
}
free(A);
free(B);
printf("\n");
}
ret = clReleaseContext(context);
free(device_id);
}
free(platform_id);
return 0;
}
# Build the OpenCL vector-add benchmark: main.c -> main.
OS := $(shell uname)
OPTIONS :=

# On Darwin link the OpenCL framework; elsewhere link the OpenCL library.
ifeq ($(OS),Darwin)
OPTIONS += -framework OpenCL
else
OPTIONS += -l OpenCL
endif

# Neither target names a file that the rule creates.
.PHONY: all clean

all: main.c
	gcc -Wall -g main.c -o main $(OPTIONS)

# Remove the binary produced by `all`; main1 and main2 are kept in the list
# because the previous rule removed them.
clean:
	rm -rf main main1 main2
# Plot columns 2-4 of data.csv against column 1 on log-log axes
# (x axis in base 2), then keep the window open.
set terminal qt
set logscale y
set logscale x 2
set datafile separator ","
# The last plot element must not end with ",\" or the plot command would
# continue into the pause statement below.
plot 'data.csv' using 1:2 title 'CPU' with lines,\
     'data.csv' using 1:3 title 'GPU1' with lines,\
     'data.csv' using 1:4 title 'GPU2' with lines
pause -1
// Element-wise vector addition: every work-item adds one pair of elements,
// selected by its global index along dimension 0, and stores the sum in C.
__kernel void vector_add(__global int *A, __global int *B, __global int *C) {
    // Position of this work-item in the global range
    const int gid = get_global_id(0);
    // Combine the matching elements of the two inputs
    const int sum = A[gid] + B[gid];
    C[gid] = sum;
}
@Foadsf
Copy link
Author

Foadsf commented May 12, 2018

I made this snippet (Gist) to compare the time required to perform a simple parallel calculation (C[i] = A[i] + B[i]) on the different devices of my Mac, and to measure how that time changes with the length of the arrays. This gist is also mentioned in this StackOverflow post and this Reddit post.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment