Last active
May 13, 2018 18:12
-
-
Save Foadsf/628a046040c302f507c81fd0568d8b34 to your computer and use it in GitHub Desktop.
OpenCL: comparing the time required to add two arrays of integers on different devices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ArrSize | CPU | GPU1 | GPU2 | ||
---|---|---|---|---|---|
1024 | 0.0022650000 | 0.0043130000 | 0.0028170000 | ||
2048 | 0.0017180000 | 0.0042000000 | 0.0034180000 | ||
4096 | 0.0018160000 | 0.0034690000 | 0.0033670000 | ||
8192 | 0.0024080000 | 0.0040740000 | 0.0032710000 | ||
16384 | 0.0029050000 | 0.0042900000 | 0.0031160000 | ||
32768 | 0.0036250000 | 0.0040950000 | 0.0034520000 | ||
65536 | 0.0052330000 | 0.0036320000 | 0.0032050000 | ||
131072 | 0.0076970000 | 0.0039470000 | 0.0031800000 | ||
262144 | 0.0162720000 | 0.0044290000 | 0.0031160000 | ||
524288 | 0.0315100000 | 0.0042210000 | 0.0035350000 | ||
1048576 | 0.0655650000 | 0.0051560000 | 0.0044900000 | ||
2097152 | 0.1322130000 | 0.0066200000 | 0.0069630000 | ||
4194304 | 0.2562870000 | 0.0099040000 | 0.0084920000 | ||
8388608 | 0.5589450000 | 0.0205990000 | 0.0121820000 | ||
16777216 | 1.0985770000 | 0.0335420000 | 0.0163770000 | ||
33554432 | 2.0345780000 | 0.0963500000 | 0.0444100000 | ||
67108864 | 3.7661480000 | 0.1875540000 | 0.0827400000 | ||
134217728 | 8.7979440000 | 0.3874850000 | 0.1690280000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> // time() | |
#ifdef __APPLE__ | |
#include <OpenCL/opencl.h> | |
#else | |
#include <CL/cl.h> | |
#endif | |
#define MAX_SOURCE_SIZE (0x100000) | |
#define OPENCL_CHECK(val) check_opencl_call((val), __FILE__, __LINE__) | |
/*
 * Report a failed OpenCL call.
 *
 * val   status code returned by (or through the errcode_ret argument of)
 *       an OpenCL API call
 * file  source file of the call site (supplied by OPENCL_CHECK via __FILE__)
 * line  source line of the call site (supplied by OPENCL_CHECK via __LINE__)
 *
 * Nothing is printed when val is CL_SUCCESS. The function only reports; it
 * does not abort, so the caller keeps running after an error.
 */
void check_opencl_call(cl_int val, const char *const file, int const line)
{
    if (val != CL_SUCCESS) {
        /* stderr, not stdout: main() writes its CSV timing rows to stdout and
         * diagnostics must not end up inside that data. The numeric status is
         * included so the failure can be looked up in the OpenCL headers. */
        fprintf(stderr, "OpenCL error %d at %s:%d\n", (int)val, file, line);
    }
}
/*
 * Integer exponentiation by squaring.
 *
 * base  value to raise
 * exp   exponent; intended to be >= 0
 *       NOTE(review): negative exponents are not rejected and follow the same
 *       loop as before this change - callers should not rely on that result.
 *
 * Returns base raised to exp (1 when exp is 0). The result must fit in int;
 * no overflow detection is performed on the result itself.
 */
int int_pow(int base, int exp)
{
    int result = 1;
    while (exp)
    {
        if (exp & 1)
            result *= base;
        exp /= 2;
        /* Square only while more exponent bits remain. Squaring after the
         * last bit is never used and can overflow int (undefined behaviour),
         * e.g. 65536 * 65536 at the end of int_pow(2, 16). */
        if (exp)
            base *= base;
    }
    return result;
}
int main(void) { | |
// Create the two input vectors | |
int i,j,k,l; | |
// Load the kernel source code into the array source_str | |
FILE *fp; | |
char *source_str; | |
size_t source_size; | |
fp = fopen("vector_add_kernel.cl", "r"); | |
if (!fp) { | |
fprintf(stderr, "Failed to load kernel.\n"); | |
exit(1); | |
} | |
source_str = (char*)malloc(MAX_SOURCE_SIZE); | |
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); | |
fclose( fp ); | |
// Get platform and device information | |
cl_platform_id* platform_id; | |
cl_uint ret_num_platforms; | |
cl_device_id* device_id; | |
cl_uint ret_num_devices; | |
// get all platforms | |
cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms); | |
OPENCL_CHECK(ret); | |
platform_id = (cl_platform_id*) malloc(sizeof(cl_platform_id) * ret_num_platforms);//allocates a block om hip memory of platformCount size | |
ret = clGetPlatformIDs(ret_num_platforms, platform_id, NULL);//put a list of OpenCL platforms in platforms | |
OPENCL_CHECK(ret); | |
for (i = 0; i < ret_num_platforms; i++) { | |
clock_t start, end; | |
double cpu_time_used; | |
// get all devices | |
ret = clGetDeviceIDs(platform_id[i], CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices); | |
OPENCL_CHECK(ret); | |
device_id = (cl_device_id*) malloc(sizeof(cl_device_id) * ret_num_devices); | |
ret = clGetDeviceIDs(platform_id[i], CL_DEVICE_TYPE_ALL, ret_num_devices, device_id, &ret_num_devices); | |
OPENCL_CHECK(ret); | |
// Create an OpenCL context | |
cl_context context = clCreateContext( NULL, ret_num_devices, device_id, NULL, NULL, &ret); | |
OPENCL_CHECK(ret); | |
for (k = 0; k < 20; k++) { | |
const int LIST_SIZE = 1024*int_pow(2,k); | |
printf("%d,", LIST_SIZE); | |
int *A = (int*)malloc(sizeof(int)*LIST_SIZE); | |
int *B = (int*)malloc(sizeof(int)*LIST_SIZE); | |
for(l = 0; l < LIST_SIZE; l++) { | |
A[i] = rand(); | |
B[i] = rand(); | |
} | |
for (j = 0; j < ret_num_devices; j++) { | |
start = clock(); | |
// Create a command queue | |
cl_command_queue command_queue = clCreateCommandQueue(context, device_id[j], 0, &ret); | |
OPENCL_CHECK(ret); | |
// Create memory buffers on the device for each vector | |
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, | |
LIST_SIZE * sizeof(int), NULL, &ret); | |
OPENCL_CHECK(ret); | |
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, | |
LIST_SIZE * sizeof(int), NULL, &ret); | |
OPENCL_CHECK(ret); | |
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, | |
LIST_SIZE * sizeof(int), NULL, &ret); | |
OPENCL_CHECK(ret); | |
// Copy the lists A and B to their respective memory buffers | |
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, | |
LIST_SIZE * sizeof(int), A, 0, NULL, NULL); | |
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, | |
LIST_SIZE * sizeof(int), B, 0, NULL, NULL); | |
// Create a program from the kernel source | |
cl_program program = clCreateProgramWithSource(context, 1, | |
(const char **)&source_str, (const size_t *)&source_size, &ret); | |
OPENCL_CHECK(ret); | |
// Build the program | |
ret = clBuildProgram(program, 1, device_id, NULL, NULL, NULL); | |
// Create the OpenCL kernel | |
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); | |
OPENCL_CHECK(ret); | |
// Set the arguments of the kernel | |
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); | |
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); | |
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); | |
// Execute the OpenCL kernel on the list | |
size_t global_item_size = LIST_SIZE; // Process the entire lists | |
size_t local_item_size = 64; // Process in groups of 64 | |
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, | |
&global_item_size, &local_item_size, 0, NULL, NULL); | |
// Read the memory buffer C on the device to the local variable C | |
int *C = (int*)malloc(sizeof(int)*LIST_SIZE); | |
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, | |
LIST_SIZE * sizeof(int), C, 0, NULL, NULL); | |
end = clock(); | |
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; | |
printf("%.10f,", cpu_time_used); | |
// Clean up | |
ret = clFlush(command_queue); | |
ret = clFinish(command_queue); | |
ret = clReleaseKernel(kernel); | |
ret = clReleaseProgram(program); | |
ret = clReleaseMemObject(a_mem_obj); | |
ret = clReleaseMemObject(b_mem_obj); | |
ret = clReleaseMemObject(c_mem_obj); | |
ret = clReleaseCommandQueue(command_queue); | |
free(C); | |
} | |
free(A); | |
free(B); | |
printf("\n"); | |
} | |
ret = clReleaseContext(context); | |
free(device_id); | |
} | |
free(platform_id); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the OpenCL vector-add benchmark: main.c -> main.
OS := $(shell uname)
OPTIONS :=

# On Darwin (macOS) link the OpenCL framework; elsewhere link the OpenCL library.
ifeq ($(OS),Darwin)
OPTIONS += -framework OpenCL
else
OPTIONS += -l OpenCL
endif

# all and clean do not name files, so they must always run when requested.
.PHONY: all clean

all: main

# Rebuild the binary only when main.c is newer than it.
main: main.c
	gcc -Wall -g main.c -o main $(OPTIONS)

# Remove the binary produced above. main1 and main2 were the only names the
# original rule removed and are kept here so nothing it cleaned is left behind.
clean:
	rm -rf main main1 main2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot the timings in data.csv: column 1 is the x value, columns 2-4 are the
# three series labelled CPU, GPU1 and GPU2. Both axes are logarithmic
# (x in base 2).
set terminal qt
set logscale y
set logscale x 2
set datafile separator ","
# No ",\" after the last plot element: a trailing continuation would splice
# the following "pause" line into the plot command.
plot 'data.csv' using 1:2 title 'CPU' with lines,\
     'data.csv' using 1:3 title 'GPU1' with lines,\
     'data.csv' using 1:4 title 'GPU2' with lines
# Keep the plot window open until return is pressed.
pause -1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Element-wise vector addition: C[i] = A[i] + B[i].
//
// A, B  input vectors (only read)
// C     output vector (only written)
//
// Each work-item handles the single element at its global id in dimension 0.
// There is no bounds check, so the launch must not use more work-items than
// the vectors have elements.
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
    // get_global_id returns size_t; keep that type instead of narrowing to int
    const size_t i = get_global_id(0);
    // Do the operation
    C[i] = A[i] + B[i];
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I made this snippet (Gist) to compare the time required to do a simple parallel calculation (C[i]=A[i]+B[i]) on the different devices of my Mac, and to measure how that time changes for arrays of different lengths. This gist is also mentioned in this StackOverflow post and this Reddit post.