yohanesgultom/! CUDA & OpenCL Intro

## ! CUDA & OpenCL Intro
Simple CUDA and OpenCL code

Compilation:

* CUDA (*.cu): nvcc filename.cu
* CUDA + CUBLAS (*.cu): nvcc filename.cu -lcublas
* OpenCL (*.c): gcc filename.c -lOpenCL

## deviceQuery.c
// device_query.c
// yohanes.gultom@gmail.com
// Original source:
// * http://stackoverflow.com/questions/17240071/what-is-the-right-way-to-call-clgetplatforminfo
// * Banger, R, Bhattacharyya .K. "OpenCL Programming by Example". 2013. Packt publishing. p43

#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define NELEMS(x)  (sizeof(x) / sizeof((x)[0]))

const cl_platform_info attributeTypes[5] = {
    CL_PLATFORM_NAME,
    CL_PLATFORM_VENDOR,
    CL_PLATFORM_VERSION,
    CL_PLATFORM_PROFILE,
    CL_PLATFORM_EXTENSIONS
};

const char* const attributeNames[] = {
    "CL_PLATFORM_NAME",
    "CL_PLATFORM_VENDOR",
    "CL_PLATFORM_VERSION",
    "CL_PLATFORM_PROFILE",
    "CL_PLATFORM_EXTENSIONS"
};

void PrintDeviceInfo(cl_device_id device)
{
    char queryBuffer[1024];
    int queryInt;
    cl_int clError;
    clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
    printf("    CL_DEVICE_NAME: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
    printf("    CL_DEVICE_VENDOR: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
    printf("    CL_DRIVER_VERSION: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
    printf("    CL_DEVICE_VERSION: %s\n", queryBuffer);
    queryBuffer[0] = '\0';
    clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
    printf("    CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
}

int main(void) {
    int i, j, k, num_attributes;
    char* info;

    cl_platform_id * platforms = NULL;
    cl_uint num_platforms;
    cl_device_id *device_list = NULL;
    cl_uint num_devices;
    cl_int clStatus;
    size_t infoSize;

    // Get platform and device information
    clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
    clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);

    // for each platform print all attributes
    num_attributes = NELEMS(attributeTypes);
    // printf("\nAttribute Count = %d ", num_attributes);
    for (i = 0; i < num_platforms; i++) {
        printf("Platform - %d\n", i+1);
        for (j = 0; j < num_attributes; j++) {
            // get platform attribute value size
            clGetPlatformInfo(platforms[i], attributeTypes[j], 0, NULL, &infoSize);
            info = (char*) malloc(infoSize);
            // get platform attribute value
            clGetPlatformInfo(platforms[i], attributeTypes[j], infoSize, info, NULL);
            printf("  %d.%d %-11s: %s\n", i+1, j+1, attributeNames[j], info);
        }
        //Get the devices list and choose the device you want to run on
        clStatus = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
        device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
        clStatus = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
        for (k = 0; k < num_devices; k++) {
            printf("  Device - %d:\n", (k+1));
            PrintDeviceInfo(device_list[k]);
        }
    }


    free(platforms);
    // free(device_list);
    return 0;
}

## mmul.cl
__kernel void matrixMul(__global float* C, __global float* A, __global float* B, int width)
{
    // 2D Thread ID
    int tx = get_global_id(0);
    int ty = get_global_id(1);

    // value stores the element that is
    // computed by the thread
    float value = 0;
    int i = 0;
    for (i = 0; i < width; ++i)
    {
        value += A[ty * width + i] * B[i * width + tx];
    }

    // Write the matrix to device memory each
    // thread writes one element
    C[ty * width + tx] = value;
}

## mmul.cl.c
/**
 * Perkalian matriks persegi
 * Source: http://gpgpu-computing4.blogspot.co.id/2009/09/matrix-multiplication-2-opencl.html
 **/

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define WIDTH 1024 // ukuran baris matriks
#define TILE_SIZE 16 // ukuran baris submatriks
#define MAX_SOURCE_SIZE (0x100000)

char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
    /* Load the source code containing the kernel*/
    FILE *fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source_str = (char*)malloc(MAX_SOURCE_SIZE);
    *source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    return source_str;
}

void randomInit(float* data, int size)
{
    int i = 0;
    for (i = 0; i < size; ++i)
    data[i] = rand() / (float)RAND_MAX;
}

void validateMatrixMul(float* C, float* A, float* B, int width) {
    int i, j, k = 0;
    float sum = .0f;
    for (i = 0; i < width; i++) {
        for (j = 0; j < width; j++) {
            sum = .0f;
            for (k = 0; k < width; k++) {
                sum = sum + A[i*width+k] * B[k*width+j];
            }
            if (fabs(C[i*width+j] - sum) > 1e-3)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
                exit(EXIT_FAILURE);
            }
        }
    }
}


int main(void)
{
    // Isi sesuai dengan indeks platform yang ingin digunakan
    // Indeks berdasarkan hasil device_query.c
    int platformId = 0;
    int deviceId = 0;

    // alokasi memory variable di host
    unsigned int size = WIDTH * WIDTH;
    unsigned int mem_size = sizeof(float) * size;
    float* h_A = (float*) malloc(mem_size);
    float* h_B = (float*) malloc(mem_size);
    float* h_C = (float*) malloc(mem_size);

    // inisialisasi acak
    randomInit(h_A, size);
    randomInit(h_B, size);

    cl_int clStatus;

    // Ambil list platforms
    cl_uint num_platforms;
    clGetPlatformIDs(0, NULL, &num_platforms);
    cl_platform_id *platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    // Pakai platform sesuai platformId
    cl_platform_id cpPlatform = platforms[platformId];

    // Ambil list devices
    cl_uint num_devices;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    cl_device_id *device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

    // Pakai device sesuai deviceId
    cl_device_id cdDevice = device_list[deviceId];

    // Buat context
    cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);

    // Buat command queue (OpenCL < 2.0)
    cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);

    // Buat command-queue (OpenCL >= 2.0)
    // cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);

    // Setup device memory
    cl_mem d_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
    cl_mem d_B = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
    cl_mem d_C = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size, NULL, &clStatus);

    // Tulis (salin) memory data dari host ke device
    clEnqueueWriteBuffer(cqCommandQueue, d_A, CL_FALSE, 0, sizeof(cl_float) * size, h_A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cqCommandQueue, d_B, CL_FALSE, 0, sizeof(cl_float) * size, h_B, 0, NULL, NULL);

    // baca kernel dari file eksternal dan buat program
    size_t szKernelLength;
    char *cSourceCL = oclLoadProgSource("mmul.cl", "// My comment\n", &szKernelLength);
    cl_program clProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);

    clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
    cl_kernel clKernel = clCreateKernel(clProgram, "matrixMul", &clStatus);

    // tentukan argumen kernel
    int w = WIDTH;
    clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
    clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
    clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
    clSetKernelArg(clKernel, 3, sizeof(cl_int), (void *)&w);

    // jalankan kernel
    size_t localWorkSize[] = {TILE_SIZE, TILE_SIZE}; // ukuran work-group (block)
    size_t globalWorkSize[] = {WIDTH, WIDTH}; // jumlah seluruh work-items (threads)
    clEnqueueNDRangeKernel(cqCommandQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

    // salin hasil dari memory device
    clEnqueueReadBuffer(cqCommandQueue, d_C, CL_TRUE, 0, mem_size, h_C, 0, NULL, NULL);

    // dealokasi objek-objek OpenCL
    clReleaseMemObject(d_A);
    clReleaseMemObject(d_C);
    clReleaseMemObject(d_B);

    clReleaseContext(cxGPUContext);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    if(cqCommandQueue) {
        clFlush(cqCommandQueue);
        clFinish(cqCommandQueue);
    }

    // validasi
//     validateMatrixMul(h_C, h_A, h_B, WIDTH);
//     printf("Test PASSED\n");

    // dealokasi matriks
    free(h_A);
    free(h_B);
    free(h_C);
    free(device_list);
    free(platforms);

    return 0;
}

## mmul_cublas.cu
/**
 * Perkalian paralel matriks bujur sangkar dengan CUBLAS
 *
 * Referensi: https://raw.githubusercontent.com/sol-prog/cuda_cublas_curand_thrust/master/mmul_1.cu
 *
 **/

#include <stdio.h>
#include <cublas_v2.h>

#define WIDTH 1024

void randomInit(float* data, int size)
{
   for (int i = 0; i < size; ++i)
   data[i] = rand() / (float)RAND_MAX;
}

void validateMatrixMul(float* C, float* A, float* B, int width) {
    int i, j, k = 0;
    float sum = .0f;
    for (i = 0; i < width; i++) {
        for (j = 0; j < width; j++) {
            sum = .0f;
            for (k = 0; k < width; k++) {
                sum = sum + A[i*width+k] * B[k*width+j];
            }
            if (fabs(C[i*width+j] - sum) > 1e-3)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
                exit(EXIT_FAILURE);
            }
        }
    }
}

int main() {

    // Alokasi variable di memory host
    unsigned int size = WIDTH * WIDTH;
    unsigned int mem_size = sizeof(float) * size;
    float* h_A = (float*) malloc(mem_size);
    float* h_B = (float*) malloc(mem_size);
    float* h_C = (float*) malloc(mem_size);

    // inisalisasi acak
    randomInit(h_A, size);
    randomInit(h_B, size);

    // Alokasi variable di memory device
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A,WIDTH * WIDTH * sizeof(float));
    cudaMalloc(&d_B,WIDTH * WIDTH * sizeof(float));
    cudaMalloc(&d_C,WIDTH * WIDTH * sizeof(float));

    // Salin variable dari memory host ke device
    cudaMemcpy(d_A,h_A,WIDTH * WIDTH * sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_B,h_B,WIDTH * WIDTH * sizeof(float),cudaMemcpyHostToDevice);

    // Eksekusi perkalian matriks
	const float alf = 1.0f;
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;
	cublasHandle_t handle;
    cublasCreate(&handle);

    // Catatan: Posisi d_A dan d_B positions ditukar karena kita menggunakan row-major format https://ipfs.io/ipfs/QmXoypizjW3WknFiJnKLwHCnL72vedxjQkDDP1mXWo6uco/wiki/Row-major_order.html
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, WIDTH, WIDTH, alpha, d_B, WIDTH, d_A, WIDTH, beta, d_C, WIDTH);

    // Salin variable hasil dari memory device ke host
    cudaMemcpy(h_C,d_C,WIDTH * WIDTH * sizeof(float),cudaMemcpyDeviceToHost);

    // Dealokasi memory device
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

//     validateMatrixMul(h_C, h_A, h_B, WIDTH);
//     printf("Test PASSED\n");

    // Dealokasi memory host
    free(h_A);
    free(h_B);
    free(h_C);

	return EXIT_SUCCESS;
}

## mmul_cuda.cu
/**
 * Perkalian paralel matriks bujur sangkar
 *
 * Referensi: http://gpgpu-computing4.blogspot.co.id/2009/08/matrix-multiplication-2.html
 *
 **/

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define WIDTH 1024 // ukuran matriks
#define TILE_SIZE 16 // ukuran tile/submatriks

__global__ void matrixMul( float* C, float* A, float* B, int width)
{

    // 2D Thread ID
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    // lakukan multiplikasi untuk elemen
    // C[tx, ty] atau C[ty * width + tx]
    float value = 0;
    for (int i = 0; i < width; ++i)
    {
        float elementA = A[ty * width + i];
        float elementB = B[i * width + tx];
        value += elementA * elementB;
    }

    C[ty * width + tx] = value;
}

void randomInit(float* data, int size)
{
   for (int i = 0; i < size; ++i)
   data[i] = rand() / (float)RAND_MAX;
}

void validateMatrixMul(float* C, float* A, float* B, int width) {
    int i, j, k = 0;
    float sum = .0f;
    for (i = 0; i < width; i++) {
        for (j = 0; j < width; j++) {
            sum = .0f;
            for (k = 0; k < width; k++) {
                sum = sum + A[i*width+k] * B[k*width+j];
            }
            if (fabs(C[i*width+j] - sum) > 1e-3)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
                exit(EXIT_FAILURE);
            }
        }
    }
}


int main()
{
    // alokasi host memory
    unsigned int size = WIDTH * WIDTH;
    unsigned int mem_size = sizeof(float) * size;
    float* h_A = (float*) malloc(mem_size);
    float* h_B = (float*) malloc(mem_size);
    float* h_C = (float*) malloc(mem_size);

    // inisalisasi acak
    randomInit(h_A, size);
    randomInit(h_B, size);

    // alokasi device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**) &d_A, mem_size);
    cudaMalloc((void**) &d_B, mem_size);
    cudaMalloc((void**) &d_C, mem_size);

    // salin data ke device memory
    cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice);

    // jalankan kernel
    // dimensi block 2D = 16 * 16 threads
    // dimensi grid 2D = 64 * 64 blocks
    // total threads = 64 * 64 * 16 * 16 = 1048576 threads
    dim3 blockDim(TILE_SIZE, TILE_SIZE);
    dim3 gridDim(WIDTH / TILE_SIZE, WIDTH / TILE_SIZE);
    matrixMul<<< gridDim, blockDim >>>(d_C, d_A, d_B, WIDTH);

    // salin hasil dari device
    cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // validasi
//     validateMatrixMul(h_C, h_A, h_B, WIDTH);
//     printf("Test PASSED\n");

    // dealokasi
    free(h_A);
    free(h_B);
    free(h_C);

}

## saxpy.cl
// SAXPY (Single precision real Alpha X plus Y)
// Original source: Banger, R, Bhattacharyya .K. OpenCL Programming by Example. 2013. Packt publishing
// By: yohanes.gultom@gmail.com

__kernel void saxpy_kernel(float alpha, __global float *A, __global float *B, __global float *C)
{
  //Get the index of the work-item
  int index = get_global_id(0);
  C[index] = alpha* A[index] + B[index];
}

## saxpy.cl.c
/**
 * Simplified SAXPY OpenCL
 * Tested on: CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
 */

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define VECTOR_SIZE 1024
#define MAX_SOURCE_SIZE (0x100000)
char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
    /* Load the source code containing the kernel*/
    FILE *fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source_str = (char*)malloc(MAX_SOURCE_SIZE);
    *source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    return source_str;
}

int main(void) {

    // Isi sesuai dengan indeks platform yang ingin digunakan
    // Indeks berdasarkan hasil device_query.c
    int platformId = 0;
    int deviceId = 0;

    int i;
    char *kernel_filename = "saxpy.cl";
    char *kernel_comment = "// saxpy";
    size_t kernelLength;

    // Allocate space for vectors A, B and C
    float alpha = 2.0;
    float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    for(i = 0; i < VECTOR_SIZE; i++)
    {
        A[i] = i;
        B[i] = VECTOR_SIZE - i;
        C[i] = 0;
    }

    // Get platform and device information
    cl_platform_id * platforms = NULL;
    cl_uint num_platforms;
    cl_device_id *device_list = NULL;
    cl_uint num_devices;
    cl_context context;
    char *kernel_content = NULL;

    //Set up the Platform
    cl_int clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
    clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);

    //Get the devices list and choose the device you want to run on
    clStatus = clGetDeviceIDs( platforms[platformId], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    clStatus = clGetDeviceIDs( platforms[platformId], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

    // Create one OpenCL context for each device in the platform
    context = clCreateContext( NULL, num_devices, device_list, NULL, NULL, &clStatus);

    // Create a command queue (OpenCL < 2.0)
    cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, &clStatus);

    // Create a command queue (OpenCL >= 2.0)
    // cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[deviceId], 0, &clStatus);

    // Create memory buffers on the device for each vector
    cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
    cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);

    // Copy the Buffer A and B to the device
    clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
    clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);

    // Create a program from the kernel source
    kernel_content = oclLoadProgSource(kernel_filename, kernel_comment, &kernelLength);
    // printf("%s\n", kernel_content);
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernel_content, NULL, &clStatus);

    // Build the program
    clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);

    // Set the arguments of the kernel
    clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
    clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
    clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
    clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);

    // Execute the OpenCL kernel on the list
    size_t global_size = VECTOR_SIZE; // Process the entire lists
    size_t local_size = 64;

    // Process one item at a time
    clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);

    // Read the cl memory C_clmem on device to the host variable C
    clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);

    // Clean up and wait for all the comands to complete.
    clStatus = clFlush(command_queue);
    clStatus = clFinish(command_queue);

    // Validate result
//     for (i = 0; i < VECTOR_SIZE; ++i)
//     {
//         if (fabs(alpha * A[i] + B[i] - C[i]) > 1e-5)
//         {
//             fprintf(stderr, "Result verification failed at element %d!\n", i);
//             exit(EXIT_FAILURE);
//         }
//     }
//     printf("Test PASSED\n");

    // Finally release all OpenCL allocated objects and host buffers.
    clStatus = clReleaseKernel(kernel);
    clStatus = clReleaseProgram(program);
    clStatus = clReleaseMemObject(A_clmem);
    clStatus = clReleaseMemObject(B_clmem);
    clStatus = clReleaseMemObject(C_clmem);
    clStatus = clReleaseCommandQueue(command_queue);
    clStatus = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    free(platforms);
    free(device_list);
    return 0;
}

## simpleIndexing.cu
/**
 * How to get global thread index on various grid/block indexing schemes
 * Source: http://www.martinpeniak.com/index.php?option=com_content&view=article&catid=17:updates&id=288:cuda-thread-indexing-explained
 *
 */

// 1D grid of 1D blocks
__device__ int getGlobalIdx_1D_1D()
{
	return blockIdx.x * blockDim.x + threadIdx.x;
}

// 1D grid of 2D blocks
__device__ int getGlobalIdx_1D_2D()
{
	return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}

// 1D grid of 3D blocks
__device__ int getGlobalIdx_1D_3D()
{
	return blockIdx.x * blockDim.x * blockDim.y * blockDim.z
	 + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
}

// 2D grid of 1D blocks
__device__ int getGlobalIdx_2D_1D()
{
	int blockId   = blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = blockId * blockDim.x + threadIdx.x;
	return threadId;
}

// 2D grid of 2D blocks
 __device__ int getGlobalIdx_2D_2D()
{
	int blockId = blockIdx.x + blockIdx.y * gridDim.x;
	int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
	return threadId;
}

// 2D grid of 3D blocks
__device__ int getGlobalIdx_2D_3D()
{
	int blockId = blockIdx.x
			 + blockIdx.y * gridDim.x;
	int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
			   + (threadIdx.z * (blockDim.x * blockDim.y))
			   + (threadIdx.y * blockDim.x)
			   + threadIdx.x;
	return threadId;
}

// 3D grid of 1D blocks
__device__ int getGlobalIdx_3D_1D()
{
	int blockId = blockIdx.x
			 + blockIdx.y * gridDim.x
			 + gridDim.x * gridDim.y * blockIdx.z;
	int threadId = blockId * blockDim.x + threadIdx.x;
	return threadId;
}


// 3D grid of 2D blocks
__device__ int getGlobalIdx_3D_2D()
{
	int blockId = blockIdx.x
		         + blockIdx.y * gridDim.x
			 + gridDim.x * gridDim.y * blockIdx.z;
	int threadId = blockId * (blockDim.x * blockDim.y)
			  + (threadIdx.y * blockDim.x)
			  + threadIdx.x;
	return threadId;
}


// 3D grid of 3D blocks
__device__ int getGlobalIdx_3D_3D()
{
	int blockId = blockIdx.x
			 + blockIdx.y * gridDim.x
			 + gridDim.x * gridDim.y * blockIdx.z;
	int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
			  + (threadIdx.z * (blockDim.x * blockDim.y))
			  + (threadIdx.y * blockDim.x)
			  + threadIdx.x;
	return threadId;
}

## vectorAdd.c
/**
 * Vector addition: C = A + B.
 * Serial CPU execution
 */

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    // ukuran/total elemen vektor
    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    for (int i = 0; i < numElements; ++i)
    {
        h_C[i] = h_A[i] + h_B[i];
    }

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

## vectorAdd.cl
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements)
{
    // ambil indeks global work-item (thread)
    int iGID = get_global_id(0);

    // jumlah work-items (threads) bisa melebihi iNumElements
    if (iGID < iNumElements)
    {
        // jumlahkan elemen vektor ke iGID
        c[iGID] = a[iGID] + b[iGID];
    }
}

## vectorAdd.cl.c
/*
 * Penjumlahan vektor
 *
 * Tested on CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
 */

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#define NUM_ELEMENTS 50000
#define MAX_SOURCE_SIZE (0x100000)

char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
    /* Load the source code containing the kernel*/
    FILE *fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    char *source_str = (char*)malloc(MAX_SOURCE_SIZE);
    *source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    return source_str;
}

int main(void)
{
    // Isi sesuai dengan indeks platform yang ingin digunakan
    // Indeks berdasarkan hasil device_query.c
    int platformId = 0;
    int deviceId = 0;

    int i = 0;
    int iNumElements = NUM_ELEMENTS;

    // Alokasi dan inisialisi variable di memory host
    float *srcA = (float *)malloc(sizeof(float) * iNumElements);
    float *srcB = (float *)malloc(sizeof(float) * iNumElements);
    float *dst = (float *)malloc(sizeof(float) * iNumElements);
    i = 0;
    for (i = 0; i < iNumElements; ++i)
    {
        srcA[i] = rand()/(float)RAND_MAX;
        srcB[i] = rand()/(float)RAND_MAX;
    }

    cl_int clStatus;

    // Ambil list platforms
    cl_uint num_platforms;
    clGetPlatformIDs(0, NULL, &num_platforms);
    cl_platform_id *platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);

    // Pakai platform sesuai platformId
    cl_platform_id cpPlatform = platforms[platformId];

    // Ambil list devices
    cl_uint num_devices;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    cl_device_id *device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

    // Pakai device sesuai deviceId
    cl_device_id cdDevice = device_list[deviceId];

    // Buat context
    cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);

    // Buat command queue (OpenCL < 2.0)
    cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);

    // Buat command-queue (OpenCL >= 2.0)
    // cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);

    // Alokasi memory di device
    cl_mem cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
    cl_mem cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
    cl_mem cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);

    // Tulis (salin) memory data dari host ke device
    clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * iNumElements, srcA, 0, NULL, NULL);
    clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * iNumElements, srcB, 0, NULL, NULL);

    // Buat program dan build dari fungsi kernel
    size_t szKernelLength;
    char *cSourceCL = oclLoadProgSource("vectorAdd.cl", "// My comment\n", &szKernelLength);
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);
    clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

    // Buat kernel
    cl_kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &clStatus);

    // Tentukan argumen kernel
    clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);

    // Jalankan kernel
    size_t szLocalWorkSize = 256; // ukuran work-group (block)
    size_t szGlobalWorkSize = iNumElements; // jumlah seluruh work-items (threads)
    clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);

    // Baca (salin) memory hasil dari device kembali ke host
    clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * iNumElements, dst, 0, NULL, NULL);

    // Validasi hasil
//     i = 0;
//     for (i = 0; i < iNumElements; i++) {
//         if (fabs(srcA[i] + srcB[i] - dst[i]) > 1e5) {
//             fprintf(stderr, "Result verification failed at element %d!\n", i);
//             exit(EXIT_FAILURE);
//         }
//     }
//     printf("Test PASSED\n");

    // Dealokasi objek openCL
    if(ckKernel)clReleaseKernel(ckKernel);
    if(cpProgram)clReleaseProgram(cpProgram);
    if(cqCommandQueue) {
        clStatus = clFlush(cqCommandQueue);
        clStatus = clFinish(cqCommandQueue);
    }
    if(cxGPUContext)clReleaseContext(cxGPUContext);

    // Dealokasi memory device
    if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
    if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
    if(cmDevDst)clReleaseMemObject(cmDevDst);

    // Dealokasi memory host
    free(srcA);
    free(srcB);
    free(dst);
    free(device_list);
    free(platforms);

    return 0;

}

## vectorAdd.cu
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // jika menggunakan indeks 2D, akan terdapat atribut x, y
    // jika menggunakan indeks 3D, akan terdapat atribut x, y, z
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // karena jumlah thread yang berjalan dapat >= total elemen
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

int main(void)
{
    // ukuran/total elemen vektor
    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;

    // (50000 + 256 - 1) / 256 = 196 blocks/grid
    // jadi ada 50176 threads yang akan dijalankan, yaitu lebih dari total elemen
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    // // alternatif
    // dim3 gridDim(blocksPerGrid);
    // dim3 blockDim(threadsPerBlock);
    // vectorAdd<<<gridDim, blockDim>>>(d_A, d_B, d_C, numElements);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // // validasi hasil
    // for (int i = 0; i < numElements; ++i)
    // {
    //     if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
    //     {
    //         fprintf(stderr, "Result verification failed at element %d!\n", i);
    //         exit(EXIT_FAILURE);
    //     }
    // }
    // printf("Test PASSED\n");

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
	Simple CUDA and OpenCL code

	Compilation:

	* CUDA (*.cu): nvcc filename.cu
	* CUDA + CUBLAS (*.cu): nvcc filename.cu -lcublas
	* OpenCL (*.c): gcc filename.c -lOpenCL
	// device_query.c
	// yohanes.gultom@gmail.com
	// Original source:
	// * http://stackoverflow.com/questions/17240071/what-is-the-right-way-to-call-clgetplatforminfo
	// * Banger, R, Bhattacharyya .K. "OpenCL Programming by Example". 2013. Packt publishing. p43

	#include <stdio.h>
	#include <stdlib.h>
	#ifdef __APPLE__
	#include <OpenCL/cl.h>
	#else
	#include <CL/cl.h>
	#endif
	#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))

	const cl_platform_info attributeTypes[5] = {
	CL_PLATFORM_NAME,
	CL_PLATFORM_VENDOR,
	CL_PLATFORM_VERSION,
	CL_PLATFORM_PROFILE,
	CL_PLATFORM_EXTENSIONS
	};

	const char* const attributeNames[] = {
	"CL_PLATFORM_NAME",
	"CL_PLATFORM_VENDOR",
	"CL_PLATFORM_VERSION",
	"CL_PLATFORM_PROFILE",
	"CL_PLATFORM_EXTENSIONS"
	};

	void PrintDeviceInfo(cl_device_id device)
	{
	char queryBuffer[1024];
	int queryInt;
	cl_int clError;
	clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
	printf(" CL_DEVICE_NAME: %s\n", queryBuffer);
	queryBuffer[0] = '\0';
	clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
	printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer);
	queryBuffer[0] = '\0';
	clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
	printf(" CL_DRIVER_VERSION: %s\n", queryBuffer);
	queryBuffer[0] = '\0';
	clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
	printf(" CL_DEVICE_VERSION: %s\n", queryBuffer);
	queryBuffer[0] = '\0';
	clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
	printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
	}

	int main(void) {
	int i, j, k, num_attributes;
	char* info;

	cl_platform_id * platforms = NULL;
	cl_uint num_platforms;
	cl_device_id *device_list = NULL;
	cl_uint num_devices;
	cl_int clStatus;
	size_t infoSize;

	// Get platform and device information
	clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
	platforms = (cl_platform_id ) malloc(sizeof(cl_platform_id) num_platforms);
	clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);

	// for each platform print all attributes
	num_attributes = NELEMS(attributeTypes);
	// printf("\nAttribute Count = %d ", num_attributes);
	for (i = 0; i < num_platforms; i++) {
	printf("Platform - %d\n", i+1);
	for (j = 0; j < num_attributes; j++) {
	// get platform attribute value size
	clGetPlatformInfo(platforms[i], attributeTypes[j], 0, NULL, &infoSize);
	info = (char*) malloc(infoSize);
	// get platform attribute value
	clGetPlatformInfo(platforms[i], attributeTypes[j], infoSize, info, NULL);
	printf(" %d.%d %-11s: %s\n", i+1, j+1, attributeNames[j], info);
	}
	//Get the devices list and choose the device you want to run on
	clStatus = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
	device_list = (cl_device_id ) malloc(sizeof(cl_device_id)num_devices);
	clStatus = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
	for (k = 0; k < num_devices; k++) {
	printf(" Device - %d:\n", (k+1));
	PrintDeviceInfo(device_list[k]);
	}
	}


	free(platforms);
	// free(device_list);
	return 0;
	}
	__kernel void matrixMul(__global float* C, __global float* A, __global float* B, int width)
	{
	// 2D Thread ID
	int tx = get_global_id(0);
	int ty = get_global_id(1);

	// value stores the element that is
	// computed by the thread
	float value = 0;
	int i = 0;
	for (i = 0; i < width; ++i)
	{
	value += A[ty * width + i] * B[i * width + tx];
	}

	// Write the matrix to device memory each
	// thread writes one element
	C[ty * width + tx] = value;
	}
	/**
	* Perkalian matriks persegi
	* Source: http://gpgpu-computing4.blogspot.co.id/2009/09/matrix-multiplication-2-opencl.html
	**/

	#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

	#include <stdlib.h>
	#include <stdio.h>
	#include <math.h>
	#ifdef __APPLE__
	#include <OpenCL/cl.h>
	#else
	#include <CL/cl.h>
	#endif

	#define WIDTH 1024 // ukuran baris matriks
	#define TILE_SIZE 16 // ukuran baris submatriks
	#define MAX_SOURCE_SIZE (0x100000)

	char oclLoadProgSource(char fileName, char comment, size_t source_size)
	{
	/* Load the source code containing the kernel*/
	FILE *fp = fopen(fileName, "r");
	if (!fp) {
	fprintf(stderr, "Failed to load kernel.\n");
	exit(1);
	}
	char source_str = (char)malloc(MAX_SOURCE_SIZE);
	*source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
	return source_str;
	}

	void randomInit(float* data, int size)
	{
	int i = 0;
	for (i = 0; i < size; ++i)
	data[i] = rand() / (float)RAND_MAX;
	}

	void validateMatrixMul(float* C, float* A, float* B, int width) {
	int i, j, k = 0;
	float sum = .0f;
	for (i = 0; i < width; i++) {
	for (j = 0; j < width; j++) {
	sum = .0f;
	for (k = 0; k < width; k++) {
	sum = sum + A[iwidth+k] B[k*width+j];
	}
	if (fabs(C[i*width+j] - sum) > 1e-3)
	{
	fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
	exit(EXIT_FAILURE);
	}
	}
	}
	}


	int main(void)
	{
	// Isi sesuai dengan indeks platform yang ingin digunakan
	// Indeks berdasarkan hasil device_query.c
	int platformId = 0;
	int deviceId = 0;

	// alokasi memory variable di host
	unsigned int size = WIDTH * WIDTH;
	unsigned int mem_size = sizeof(float) * size;
	float* h_A = (float*) malloc(mem_size);
	float* h_B = (float*) malloc(mem_size);
	float* h_C = (float*) malloc(mem_size);

	// inisialisasi acak
	randomInit(h_A, size);
	randomInit(h_B, size);

	cl_int clStatus;

	// Ambil list platforms
	cl_uint num_platforms;
	clGetPlatformIDs(0, NULL, &num_platforms);
	cl_platform_id platforms = (cl_platform_id ) malloc(sizeof(cl_platform_id)*num_platforms);
	clGetPlatformIDs(num_platforms, platforms, NULL);

	// Pakai platform sesuai platformId
	cl_platform_id cpPlatform = platforms[platformId];

	// Ambil list devices
	cl_uint num_devices;
	clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
	cl_device_id device_list = (cl_device_id ) malloc(sizeof(cl_device_id)*num_devices);
	clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

	// Pakai device sesuai deviceId
	cl_device_id cdDevice = device_list[deviceId];

	// Buat context
	cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);

	// Buat command queue (OpenCL < 2.0)
	cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);

	// Buat command-queue (OpenCL >= 2.0)
	// cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);

	// Setup device memory
	cl_mem d_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
	cl_mem d_B = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
	cl_mem d_C = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size, NULL, &clStatus);

	// Tulis (salin) memory data dari host ke device
	clEnqueueWriteBuffer(cqCommandQueue, d_A, CL_FALSE, 0, sizeof(cl_float) * size, h_A, 0, NULL, NULL);
	clEnqueueWriteBuffer(cqCommandQueue, d_B, CL_FALSE, 0, sizeof(cl_float) * size, h_B, 0, NULL, NULL);

	// baca kernel dari file eksternal dan buat program
	size_t szKernelLength;
	char *cSourceCL = oclLoadProgSource("mmul.cl", "// My comment\n", &szKernelLength);
	cl_program clProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);

	clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
	cl_kernel clKernel = clCreateKernel(clProgram, "matrixMul", &clStatus);

	// tentukan argumen kernel
	int w = WIDTH;
	clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
	clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
	clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
	clSetKernelArg(clKernel, 3, sizeof(cl_int), (void *)&w);

	// jalankan kernel
	size_t localWorkSize[] = {TILE_SIZE, TILE_SIZE}; // ukuran work-group (block)
	size_t globalWorkSize[] = {WIDTH, WIDTH}; // jumlah seluruh work-items (threads)
	clEnqueueNDRangeKernel(cqCommandQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);

	// salin hasil dari memory device
	clEnqueueReadBuffer(cqCommandQueue, d_C, CL_TRUE, 0, mem_size, h_C, 0, NULL, NULL);

	// dealokasi objek-objek OpenCL
	clReleaseMemObject(d_A);
	clReleaseMemObject(d_C);
	clReleaseMemObject(d_B);

	clReleaseContext(cxGPUContext);
	clReleaseKernel(clKernel);
	clReleaseProgram(clProgram);
	if(cqCommandQueue) {
	clFlush(cqCommandQueue);
	clFinish(cqCommandQueue);
	}

	// validasi
	// validateMatrixMul(h_C, h_A, h_B, WIDTH);
	// printf("Test PASSED\n");

	// dealokasi matriks
	free(h_A);
	free(h_B);
	free(h_C);
	free(device_list);
	free(platforms);

	return 0;
	}
	/**
	* Perkalian paralel matriks bujur sangkar dengan CUBLAS
	*
	* Referensi: https://raw.githubusercontent.com/sol-prog/cuda_cublas_curand_thrust/master/mmul_1.cu
	*
	**/

	#include <stdio.h>
	#include <cublas_v2.h>

	#define WIDTH 1024

	void randomInit(float* data, int size)
	{
	for (int i = 0; i < size; ++i)
	data[i] = rand() / (float)RAND_MAX;
	}

	void validateMatrixMul(float* C, float* A, float* B, int width) {
	int i, j, k = 0;
	float sum = .0f;
	for (i = 0; i < width; i++) {
	for (j = 0; j < width; j++) {
	sum = .0f;
	for (k = 0; k < width; k++) {
	sum = sum + A[iwidth+k] B[k*width+j];
	}
	if (fabs(C[i*width+j] - sum) > 1e-3)
	{
	fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
	exit(EXIT_FAILURE);
	}
	}
	}
	}

	int main() {

	// Alokasi variable di memory host
	unsigned int size = WIDTH * WIDTH;
	unsigned int mem_size = sizeof(float) * size;
	float* h_A = (float*) malloc(mem_size);
	float* h_B = (float*) malloc(mem_size);
	float* h_C = (float*) malloc(mem_size);

	// inisalisasi acak
	randomInit(h_A, size);
	randomInit(h_B, size);

	// Alokasi variable di memory device
	float d_A, d_B, *d_C;
	cudaMalloc(&d_A,WIDTH * WIDTH * sizeof(float));
	cudaMalloc(&d_B,WIDTH * WIDTH * sizeof(float));
	cudaMalloc(&d_C,WIDTH * WIDTH * sizeof(float));

	// Salin variable dari memory host ke device
	cudaMemcpy(d_A,h_A,WIDTH * WIDTH * sizeof(float),cudaMemcpyHostToDevice);
	cudaMemcpy(d_B,h_B,WIDTH * WIDTH * sizeof(float),cudaMemcpyHostToDevice);

	// Eksekusi perkalian matriks
	const float alf = 1.0f;
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;
	cublasHandle_t handle;
	cublasCreate(&handle);

	// Catatan: Posisi d_A dan d_B positions ditukar karena kita menggunakan row-major format https://ipfs.io/ipfs/QmXoypizjW3WknFiJnKLwHCnL72vedxjQkDDP1mXWo6uco/wiki/Row-major_order.html
	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, WIDTH, WIDTH, alpha, d_B, WIDTH, d_A, WIDTH, beta, d_C, WIDTH);

	// Salin variable hasil dari memory device ke host
	cudaMemcpy(h_C,d_C,WIDTH * WIDTH * sizeof(float),cudaMemcpyDeviceToHost);

	// Dealokasi memory device
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	// validateMatrixMul(h_C, h_A, h_B, WIDTH);
	// printf("Test PASSED\n");

	// Dealokasi memory host
	free(h_A);
	free(h_B);
	free(h_C);

	return EXIT_SUCCESS;
	}
	/**
	* Perkalian paralel matriks bujur sangkar
	*
	* Referensi: http://gpgpu-computing4.blogspot.co.id/2009/08/matrix-multiplication-2.html
	*
	**/

	#include <stdlib.h>
	#include <stdio.h>
	#include <math.h>

	#define WIDTH 1024 // ukuran matriks
	#define TILE_SIZE 16 // ukuran tile/submatriks

	__global__ void matrixMul( float* C, float* A, float* B, int width)
	{

	// 2D Thread ID
	int tx = blockIdx.x * blockDim.x + threadIdx.x;
	int ty = blockIdx.y * blockDim.y + threadIdx.y;

	// lakukan multiplikasi untuk elemen
	// C[tx, ty] atau C[ty * width + tx]
	float value = 0;
	for (int i = 0; i < width; ++i)
	{
	float elementA = A[ty * width + i];
	float elementB = B[i * width + tx];
	value += elementA * elementB;
	}

	C[ty * width + tx] = value;
	}

	void randomInit(float* data, int size)
	{
	for (int i = 0; i < size; ++i)
	data[i] = rand() / (float)RAND_MAX;
	}

	void validateMatrixMul(float* C, float* A, float* B, int width) {
	int i, j, k = 0;
	float sum = .0f;
	for (i = 0; i < width; i++) {
	for (j = 0; j < width; j++) {
	sum = .0f;
	for (k = 0; k < width; k++) {
	sum = sum + A[iwidth+k] B[k*width+j];
	}
	if (fabs(C[i*width+j] - sum) > 1e-3)
	{
	fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
	exit(EXIT_FAILURE);
	}
	}
	}
	}


	int main()
	{
	// alokasi host memory
	unsigned int size = WIDTH * WIDTH;
	unsigned int mem_size = sizeof(float) * size;
	float* h_A = (float*) malloc(mem_size);
	float* h_B = (float*) malloc(mem_size);
	float* h_C = (float*) malloc(mem_size);

	// inisalisasi acak
	randomInit(h_A, size);
	randomInit(h_B, size);

	// alokasi device memory
	float d_A, d_B, *d_C;
	cudaMalloc((void**) &d_A, mem_size);
	cudaMalloc((void**) &d_B, mem_size);
	cudaMalloc((void**) &d_C, mem_size);

	// salin data ke device memory
	cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice);

	// jalankan kernel
	// dimensi block 2D = 16 * 16 threads
	// dimensi grid 2D = 64 * 64 blocks
	// total threads = 64 * 64 * 16 * 16 = 1048576 threads
	dim3 blockDim(TILE_SIZE, TILE_SIZE);
	dim3 gridDim(WIDTH / TILE_SIZE, WIDTH / TILE_SIZE);
	matrixMul<<< gridDim, blockDim >>>(d_C, d_A, d_B, WIDTH);

	// salin hasil dari device
	cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost);

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	// validasi
	// validateMatrixMul(h_C, h_A, h_B, WIDTH);
	// printf("Test PASSED\n");

	// dealokasi
	free(h_A);
	free(h_B);
	free(h_C);

	}
	// SAXPY (Single precision real Alpha X plus Y)
	// Original source: Banger, R, Bhattacharyya .K. OpenCL Programming by Example. 2013. Packt publishing
	// By: yohanes.gultom@gmail.com

	__kernel void saxpy_kernel(float alpha, __global float A, __global float B, __global float *C)
	{
	//Get the index of the work-item
	int index = get_global_id(0);
	C[index] = alpha* A[index] + B[index];
	}
	/**
	* Simplified SAXPY OpenCL
	* Tested on: CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
	*/

	#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>
	#ifdef __APPLE__
	#include <OpenCL/cl.h>
	#else
	#include <CL/cl.h>
	#endif

	#define VECTOR_SIZE 1024
	#define MAX_SOURCE_SIZE (0x100000)
	char oclLoadProgSource(char fileName, char comment, size_t source_size)
	{
	/* Load the source code containing the kernel*/
	FILE *fp = fopen(fileName, "r");
	if (!fp) {
	fprintf(stderr, "Failed to load kernel.\n");
	exit(1);
	}
	char source_str = (char)malloc(MAX_SOURCE_SIZE);
	*source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);
	return source_str;
	}

	int main(void) {

	// Isi sesuai dengan indeks platform yang ingin digunakan
	// Indeks berdasarkan hasil device_query.c
	int platformId = 0;
	int deviceId = 0;

	int i;
	char *kernel_filename = "saxpy.cl";
	char *kernel_comment = "// saxpy";
	size_t kernelLength;

	// Allocate space for vectors A, B and C
	float alpha = 2.0;
	float A = (float)malloc(sizeof(float)*VECTOR_SIZE);
	float B = (float)malloc(sizeof(float)*VECTOR_SIZE);
	float C = (float)malloc(sizeof(float)*VECTOR_SIZE);
	for(i = 0; i < VECTOR_SIZE; i++)
	{
	A[i] = i;
	B[i] = VECTOR_SIZE - i;
	C[i] = 0;
	}

	// Get platform and device information
	cl_platform_id * platforms = NULL;
	cl_uint num_platforms;
	cl_device_id *device_list = NULL;
	cl_uint num_devices;
	cl_context context;
	char *kernel_content = NULL;

	//Set up the Platform
	cl_int clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
	platforms = (cl_platform_id ) malloc(sizeof(cl_platform_id)num_platforms);
	clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);

	//Get the devices list and choose the device you want to run on
	clStatus = clGetDeviceIDs( platforms[platformId], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
	device_list = (cl_device_id ) malloc(sizeof(cl_device_id)num_devices);
	clStatus = clGetDeviceIDs( platforms[platformId], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);

	// Create one OpenCL context for each device in the platform
	context = clCreateContext( NULL, num_devices, device_list, NULL, NULL, &clStatus);

	// Create a command queue (OpenCL < 2.0)
	cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, &clStatus);

	// Create a command queue (OpenCL >= 2.0)
	// cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[deviceId], 0, &clStatus);

	// Create memory buffers on the device for each vector
	cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
	cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
	cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);

	// Copy the Buffer A and B to the device
	clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
	clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);

	// Create a program from the kernel source
	kernel_content = oclLoadProgSource(kernel_filename, kernel_comment, &kernelLength);
	// printf("%s\n", kernel_content);
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernel_content, NULL, &clStatus);

	// Build the program
	clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);

	// Create the OpenCL kernel
	cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);

	// Set the arguments of the kernel
	clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
	clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
	clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
	clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);

	// Execute the OpenCL kernel on the list
	size_t global_size = VECTOR_SIZE; // Process the entire lists
	size_t local_size = 64;

	// Process one item at a time
	clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);

	// Read the cl memory C_clmem on device to the host variable C
	clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);

	// Clean up and wait for all the comands to complete.
	clStatus = clFlush(command_queue);
	clStatus = clFinish(command_queue);

	// Validate result
	// for (i = 0; i < VECTOR_SIZE; ++i)
	// {
	// if (fabs(alpha * A[i] + B[i] - C[i]) > 1e-5)
	// {
	// fprintf(stderr, "Result verification failed at element %d!\n", i);
	// exit(EXIT_FAILURE);
	// }
	// }
	// printf("Test PASSED\n");

	// Finally release all OpenCL allocated objects and host buffers.
	clStatus = clReleaseKernel(kernel);
	clStatus = clReleaseProgram(program);
	clStatus = clReleaseMemObject(A_clmem);
	clStatus = clReleaseMemObject(B_clmem);
	clStatus = clReleaseMemObject(C_clmem);
	clStatus = clReleaseCommandQueue(command_queue);
	clStatus = clReleaseContext(context);
	free(A);
	free(B);
	free(C);
	free(platforms);
	free(device_list);
	return 0;
	}
	/**
	* How to get global thread index on various grid/block indexing schemes
	* Source: http://www.martinpeniak.com/index.php?option=com_content&view=article&catid=17:updates&id=288:cuda-thread-indexing-explained
	*
	*/

	// 1D grid of 1D blocks
	__device__ int getGlobalIdx_1D_1D()
	{
	return blockIdx.x * blockDim.x + threadIdx.x;
	}

	// 1D grid of 2D blocks
	__device__ int getGlobalIdx_1D_2D()
	{
	return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
	}

	// 1D grid of 3D blocks
	__device__ int getGlobalIdx_1D_3D()
	{
	return blockIdx.x * blockDim.x * blockDim.y * blockDim.z
	+ threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	}

	// 2D grid of 1D blocks
	__device__ int getGlobalIdx_2D_1D()
	{
	int blockId = blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = blockId * blockDim.x + threadIdx.x;
	return threadId;
	}

	// 2D grid of 2D blocks
	__device__ int getGlobalIdx_2D_2D()
	{
	int blockId = blockIdx.x + blockIdx.y * gridDim.x;
	int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
	return threadId;
	}

	// 2D grid of 3D blocks
	__device__ int getGlobalIdx_2D_3D()
	{
	int blockId = blockIdx.x
	+ blockIdx.y * gridDim.x;
	int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
	+ (threadIdx.z * (blockDim.x * blockDim.y))
	+ (threadIdx.y * blockDim.x)
	+ threadIdx.x;
	return threadId;
	}

	// 3D grid of 1D blocks
	__device__ int getGlobalIdx_3D_1D()
	{
	int blockId = blockIdx.x
	+ blockIdx.y * gridDim.x
	+ gridDim.x * gridDim.y * blockIdx.z;
	int threadId = blockId * blockDim.x + threadIdx.x;
	return threadId;
	}


	// 3D grid of 2D blocks
	__device__ int getGlobalIdx_3D_2D()
	{
	int blockId = blockIdx.x
	+ blockIdx.y * gridDim.x
	+ gridDim.x * gridDim.y * blockIdx.z;
	int threadId = blockId * (blockDim.x * blockDim.y)
	+ (threadIdx.y * blockDim.x)
	+ threadIdx.x;
	return threadId;
	}


	// 3D grid of 3D blocks
	__device__ int getGlobalIdx_3D_3D()
	{
	int blockId = blockIdx.x
	+ blockIdx.y * gridDim.x
	+ gridDim.x * gridDim.y * blockIdx.z;
	int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
	+ (threadIdx.z * (blockDim.x * blockDim.y))
	+ (threadIdx.y * blockDim.x)
	+ threadIdx.x;
	return threadId;
	}
	/**
	* Vector addition: C = A + B.
	* Serial CPU execution
	*/

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
	// ukuran/total elemen vektor
	int numElements = 50000;
	size_t size = numElements * sizeof(float);

	float h_A = (float )malloc(size);
	float h_B = (float )malloc(size);
	float h_C = (float )malloc(size);

	for (int i = 0; i < numElements; ++i)
	{
	h_A[i] = rand()/(float)RAND_MAX;
	h_B[i] = rand()/(float)RAND_MAX;
	}

	for (int i = 0; i < numElements; ++i)
	{
	h_C[i] = h_A[i] + h_B[i];
	}

	free(h_A);
	free(h_B);
	free(h_C);

	return 0;
	}