Skip to content

Instantly share code, notes, and snippets.

@yohanesgultom
Last active May 27, 2023 04:51
Show Gist options
  • Save yohanesgultom/b7e32f7649ac39e00ad65bcb83dfd72e to your computer and use it in GitHub Desktop.
Save yohanesgultom/b7e32f7649ac39e00ad65bcb83dfd72e to your computer and use it in GitHub Desktop.
Simple CUDA and OpenCL code
Simple CUDA and OpenCL code
Compilation:
* CUDA (*.cu): nvcc filename.cu
* CUDA + CUBLAS (*.cu): nvcc filename.cu -lcublas
* OpenCL (*.c): gcc filename.c -lOpenCL
// device_query.c
// yohanes.gultom@gmail.com
// Original source:
// * http://stackoverflow.com/questions/17240071/what-is-the-right-way-to-call-clgetplatforminfo
// * Banger, R, Bhattacharyya .K. "OpenCL Programming by Example". 2013. Packt publishing​. p43
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
const cl_platform_info attributeTypes[5] = {
CL_PLATFORM_NAME,
CL_PLATFORM_VENDOR,
CL_PLATFORM_VERSION,
CL_PLATFORM_PROFILE,
CL_PLATFORM_EXTENSIONS
};
const char* const attributeNames[] = {
"CL_PLATFORM_NAME",
"CL_PLATFORM_VENDOR",
"CL_PLATFORM_VERSION",
"CL_PLATFORM_PROFILE",
"CL_PLATFORM_EXTENSIONS"
};
void PrintDeviceInfo(cl_device_id device)
{
char queryBuffer[1024];
int queryInt;
cl_int clError;
clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
printf(" CL_DEVICE_NAME: %s\n", queryBuffer);
queryBuffer[0] = '\0';
clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer);
queryBuffer[0] = '\0';
clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
printf(" CL_DRIVER_VERSION: %s\n", queryBuffer);
queryBuffer[0] = '\0';
clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
printf(" CL_DEVICE_VERSION: %s\n", queryBuffer);
queryBuffer[0] = '\0';
clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
}
int main(void) {
int i, j, k, num_attributes;
char* info;
cl_platform_id * platforms = NULL;
cl_uint num_platforms;
cl_device_id *device_list = NULL;
cl_uint num_devices;
cl_int clStatus;
size_t infoSize;
// Get platform and device information
clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
// for each platform print all attributes
num_attributes = NELEMS(attributeTypes);
// printf("\nAttribute Count = %d ", num_attributes);
for (i = 0; i < num_platforms; i++) {
printf("Platform - %d\n", i+1);
for (j = 0; j < num_attributes; j++) {
// get platform attribute value size
clGetPlatformInfo(platforms[i], attributeTypes[j], 0, NULL, &infoSize);
info = (char*) malloc(infoSize);
// get platform attribute value
clGetPlatformInfo(platforms[i], attributeTypes[j], infoSize, info, NULL);
printf(" %d.%d %-11s: %s\n", i+1, j+1, attributeNames[j], info);
}
//Get the devices list and choose the device you want to run on
clStatus = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
clStatus = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
for (k = 0; k < num_devices; k++) {
printf(" Device - %d:\n", (k+1));
PrintDeviceInfo(device_list[k]);
}
}
free(platforms);
// free(device_list);
return 0;
}
__kernel void matrixMul(__global float* C, __global float* A, __global float* B, int width)
{
// 2D Thread ID
int tx = get_global_id(0);
int ty = get_global_id(1);
// value stores the element that is
// computed by the thread
float value = 0;
int i = 0;
for (i = 0; i < width; ++i)
{
value += A[ty * width + i] * B[i * width + tx];
}
// Write the matrix to device memory each
// thread writes one element
C[ty * width + tx] = value;
}
/**
* Perkalian matriks persegi
* Source: http://gpgpu-computing4.blogspot.co.id/2009/09/matrix-multiplication-2-opencl.html
**/
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define WIDTH 1024 // ukuran baris matriks
#define TILE_SIZE 16 // ukuran baris submatriks
#define MAX_SOURCE_SIZE (0x100000)
char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
/* Load the source code containing the kernel*/
FILE *fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
char *source_str = (char*)malloc(MAX_SOURCE_SIZE);
*source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
return source_str;
}
void randomInit(float* data, int size)
{
int i = 0;
for (i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void validateMatrixMul(float* C, float* A, float* B, int width) {
int i, j, k = 0;
float sum = .0f;
for (i = 0; i < width; i++) {
for (j = 0; j < width; j++) {
sum = .0f;
for (k = 0; k < width; k++) {
sum = sum + A[i*width+k] * B[k*width+j];
}
if (fabs(C[i*width+j] - sum) > 1e-3)
{
fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
exit(EXIT_FAILURE);
}
}
}
}
int main(void)
{
// Isi sesuai dengan indeks platform yang ingin digunakan
// Indeks berdasarkan hasil device_query.c
int platformId = 0;
int deviceId = 0;
// alokasi memory variable di host
unsigned int size = WIDTH * WIDTH;
unsigned int mem_size = sizeof(float) * size;
float* h_A = (float*) malloc(mem_size);
float* h_B = (float*) malloc(mem_size);
float* h_C = (float*) malloc(mem_size);
// inisialisasi acak
randomInit(h_A, size);
randomInit(h_B, size);
cl_int clStatus;
// Ambil list platforms
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
cl_platform_id *platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
// Pakai platform sesuai platformId
cl_platform_id cpPlatform = platforms[platformId];
// Ambil list devices
cl_uint num_devices;
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
cl_device_id *device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
// Pakai device sesuai deviceId
cl_device_id cdDevice = device_list[deviceId];
// Buat context
cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
// Buat command queue (OpenCL < 2.0)
cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);
// Buat command-queue (OpenCL >= 2.0)
// cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);
// Setup device memory
cl_mem d_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
cl_mem d_B = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size, NULL, &clStatus);
cl_mem d_C = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size, NULL, &clStatus);
// Tulis (salin) memory data dari host ke device
clEnqueueWriteBuffer(cqCommandQueue, d_A, CL_FALSE, 0, sizeof(cl_float) * size, h_A, 0, NULL, NULL);
clEnqueueWriteBuffer(cqCommandQueue, d_B, CL_FALSE, 0, sizeof(cl_float) * size, h_B, 0, NULL, NULL);
// baca kernel dari file eksternal dan buat program
size_t szKernelLength;
char *cSourceCL = oclLoadProgSource("mmul.cl", "// My comment\n", &szKernelLength);
cl_program clProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);
clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
cl_kernel clKernel = clCreateKernel(clProgram, "matrixMul", &clStatus);
// tentukan argumen kernel
int w = WIDTH;
clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
clSetKernelArg(clKernel, 3, sizeof(cl_int), (void *)&w);
// jalankan kernel
size_t localWorkSize[] = {TILE_SIZE, TILE_SIZE}; // ukuran work-group (block)
size_t globalWorkSize[] = {WIDTH, WIDTH}; // jumlah seluruh work-items (threads)
clEnqueueNDRangeKernel(cqCommandQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
// salin hasil dari memory device
clEnqueueReadBuffer(cqCommandQueue, d_C, CL_TRUE, 0, mem_size, h_C, 0, NULL, NULL);
// dealokasi objek-objek OpenCL
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
clReleaseContext(cxGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
if(cqCommandQueue) {
clFlush(cqCommandQueue);
clFinish(cqCommandQueue);
}
// validasi
// validateMatrixMul(h_C, h_A, h_B, WIDTH);
// printf("Test PASSED\n");
// dealokasi matriks
free(h_A);
free(h_B);
free(h_C);
free(device_list);
free(platforms);
return 0;
}
/**
* Perkalian paralel matriks bujur sangkar dengan CUBLAS
*
* Referensi: https://raw.githubusercontent.com/sol-prog/cuda_cublas_curand_thrust/master/mmul_1.cu
*
**/
#include <stdio.h>
#include <cublas_v2.h>
#define WIDTH 1024
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void validateMatrixMul(float* C, float* A, float* B, int width) {
int i, j, k = 0;
float sum = .0f;
for (i = 0; i < width; i++) {
for (j = 0; j < width; j++) {
sum = .0f;
for (k = 0; k < width; k++) {
sum = sum + A[i*width+k] * B[k*width+j];
}
if (fabs(C[i*width+j] - sum) > 1e-3)
{
fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
exit(EXIT_FAILURE);
}
}
}
}
int main() {
// Alokasi variable di memory host
unsigned int size = WIDTH * WIDTH;
unsigned int mem_size = sizeof(float) * size;
float* h_A = (float*) malloc(mem_size);
float* h_B = (float*) malloc(mem_size);
float* h_C = (float*) malloc(mem_size);
// inisalisasi acak
randomInit(h_A, size);
randomInit(h_B, size);
// Alokasi variable di memory device
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A,WIDTH * WIDTH * sizeof(float));
cudaMalloc(&d_B,WIDTH * WIDTH * sizeof(float));
cudaMalloc(&d_C,WIDTH * WIDTH * sizeof(float));
// Salin variable dari memory host ke device
cudaMemcpy(d_A,h_A,WIDTH * WIDTH * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_B,h_B,WIDTH * WIDTH * sizeof(float),cudaMemcpyHostToDevice);
// Eksekusi perkalian matriks
const float alf = 1.0f;
const float bet = 0.0f;
const float *alpha = &alf;
const float *beta = &bet;
cublasHandle_t handle;
cublasCreate(&handle);
// Catatan: Posisi d_A dan d_B positions ditukar karena kita menggunakan row-major format https://ipfs.io/ipfs/QmXoypizjW3WknFiJnKLwHCnL72vedxjQkDDP1mXWo6uco/wiki/Row-major_order.html
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, WIDTH, WIDTH, alpha, d_B, WIDTH, d_A, WIDTH, beta, d_C, WIDTH);
// Salin variable hasil dari memory device ke host
cudaMemcpy(h_C,d_C,WIDTH * WIDTH * sizeof(float),cudaMemcpyDeviceToHost);
// Dealokasi memory device
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// validateMatrixMul(h_C, h_A, h_B, WIDTH);
// printf("Test PASSED\n");
// Dealokasi memory host
free(h_A);
free(h_B);
free(h_C);
return EXIT_SUCCESS;
}
/**
* Perkalian paralel matriks bujur sangkar
*
* Referensi: http://gpgpu-computing4.blogspot.co.id/2009/08/matrix-multiplication-2.html
*
**/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define WIDTH 1024 // ukuran matriks
#define TILE_SIZE 16 // ukuran tile/submatriks
__global__ void matrixMul( float* C, float* A, float* B, int width)
{
// 2D Thread ID
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int ty = blockIdx.y * blockDim.y + threadIdx.y;
// lakukan multiplikasi untuk elemen
// C[tx, ty] atau C[ty * width + tx]
float value = 0;
for (int i = 0; i < width; ++i)
{
float elementA = A[ty * width + i];
float elementB = B[i * width + tx];
value += elementA * elementB;
}
C[ty * width + tx] = value;
}
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void validateMatrixMul(float* C, float* A, float* B, int width) {
int i, j, k = 0;
float sum = .0f;
for (i = 0; i < width; i++) {
for (j = 0; j < width; j++) {
sum = .0f;
for (k = 0; k < width; k++) {
sum = sum + A[i*width+k] * B[k*width+j];
}
if (fabs(C[i*width+j] - sum) > 1e-3)
{
fprintf(stderr, "Result verification failed at element %d!\n", i*width+j);
exit(EXIT_FAILURE);
}
}
}
}
int main()
{
// alokasi host memory
unsigned int size = WIDTH * WIDTH;
unsigned int mem_size = sizeof(float) * size;
float* h_A = (float*) malloc(mem_size);
float* h_B = (float*) malloc(mem_size);
float* h_C = (float*) malloc(mem_size);
// inisalisasi acak
randomInit(h_A, size);
randomInit(h_B, size);
// alokasi device memory
float *d_A, *d_B, *d_C;
cudaMalloc((void**) &d_A, mem_size);
cudaMalloc((void**) &d_B, mem_size);
cudaMalloc((void**) &d_C, mem_size);
// salin data ke device memory
cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice);
// jalankan kernel
// dimensi block 2D = 16 * 16 threads
// dimensi grid 2D = 64 * 64 blocks
// total threads = 64 * 64 * 16 * 16 = 1048576 threads
dim3 blockDim(TILE_SIZE, TILE_SIZE);
dim3 gridDim(WIDTH / TILE_SIZE, WIDTH / TILE_SIZE);
matrixMul<<< gridDim, blockDim >>>(d_C, d_A, d_B, WIDTH);
// salin hasil dari device
cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// validasi
// validateMatrixMul(h_C, h_A, h_B, WIDTH);
// printf("Test PASSED\n");
// dealokasi
free(h_A);
free(h_B);
free(h_C);
}
// SAXPY (Single precision real Alpha X plus Y)
// Original source: Banger, R, Bhattacharyya .K. OpenCL Programming by Example. 2013. Packt publishing
// By: yohanes.gultom@gmail.com
__kernel void saxpy_kernel(float alpha, __global float *A, __global float *B, __global float *C)
{
//Get the index of the work-item
int index = get_global_id(0);
C[index] = alpha* A[index] + B[index];
}
/**
* Simplified SAXPY OpenCL
* Tested on: CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
*/
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define VECTOR_SIZE 1024
#define MAX_SOURCE_SIZE (0x100000)
char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
/* Load the source code containing the kernel*/
FILE *fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
char *source_str = (char*)malloc(MAX_SOURCE_SIZE);
*source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
return source_str;
}
int main(void) {
// Isi sesuai dengan indeks platform yang ingin digunakan
// Indeks berdasarkan hasil device_query.c
int platformId = 0;
int deviceId = 0;
int i;
char *kernel_filename = "saxpy.cl";
char *kernel_comment = "// saxpy";
size_t kernelLength;
// Allocate space for vectors A, B and C
float alpha = 2.0;
float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
for(i = 0; i < VECTOR_SIZE; i++)
{
A[i] = i;
B[i] = VECTOR_SIZE - i;
C[i] = 0;
}
// Get platform and device information
cl_platform_id * platforms = NULL;
cl_uint num_platforms;
cl_device_id *device_list = NULL;
cl_uint num_devices;
cl_context context;
char *kernel_content = NULL;
//Set up the Platform
cl_int clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
//Get the devices list and choose the device you want to run on
clStatus = clGetDeviceIDs( platforms[platformId], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
clStatus = clGetDeviceIDs( platforms[platformId], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
// Create one OpenCL context for each device in the platform
context = clCreateContext( NULL, num_devices, device_list, NULL, NULL, &clStatus);
// Create a command queue (OpenCL < 2.0)
cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, &clStatus);
// Create a command queue (OpenCL >= 2.0)
// cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[deviceId], 0, &clStatus);
// Create memory buffers on the device for each vector
cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus);
// Copy the Buffer A and B to the device
clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);
// Create a program from the kernel source
kernel_content = oclLoadProgSource(kernel_filename, kernel_comment, &kernelLength);
// printf("%s\n", kernel_content);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernel_content, NULL, &clStatus);
// Build the program
clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
// Set the arguments of the kernel
clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
// Execute the OpenCL kernel on the list
size_t global_size = VECTOR_SIZE; // Process the entire lists
size_t local_size = 64;
// Process one item at a time
clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
// Read the cl memory C_clmem on device to the host variable C
clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
// Clean up and wait for all the comands to complete.
clStatus = clFlush(command_queue);
clStatus = clFinish(command_queue);
// Validate result
// for (i = 0; i < VECTOR_SIZE; ++i)
// {
// if (fabs(alpha * A[i] + B[i] - C[i]) > 1e-5)
// {
// fprintf(stderr, "Result verification failed at element %d!\n", i);
// exit(EXIT_FAILURE);
// }
// }
// printf("Test PASSED\n");
// Finally release all OpenCL allocated objects and host buffers.
clStatus = clReleaseKernel(kernel);
clStatus = clReleaseProgram(program);
clStatus = clReleaseMemObject(A_clmem);
clStatus = clReleaseMemObject(B_clmem);
clStatus = clReleaseMemObject(C_clmem);
clStatus = clReleaseCommandQueue(command_queue);
clStatus = clReleaseContext(context);
free(A);
free(B);
free(C);
free(platforms);
free(device_list);
return 0;
}
/**
* How to get global thread index on various grid/block indexing schemes
* Source: http://www.martinpeniak.com/index.php?option=com_content&view=article&catid=17:updates&id=288:cuda-thread-indexing-explained
*
*/
// 1D grid of 1D blocks
__device__ int getGlobalIdx_1D_1D()
{
return blockIdx.x * blockDim.x + threadIdx.x;
}
// 1D grid of 2D blocks
__device__ int getGlobalIdx_1D_2D()
{
return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}
// 1D grid of 3D blocks
__device__ int getGlobalIdx_1D_3D()
{
return blockIdx.x * blockDim.x * blockDim.y * blockDim.z
+ threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
}
// 2D grid of 1D blocks
__device__ int getGlobalIdx_2D_1D()
{
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int threadId = blockId * blockDim.x + threadIdx.x;
return threadId;
}
// 2D grid of 2D blocks
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
// 2D grid of 3D blocks
__device__ int getGlobalIdx_2D_3D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
// 3D grid of 1D blocks
__device__ int getGlobalIdx_3D_1D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * blockDim.x + threadIdx.x;
return threadId;
}
// 3D grid of 2D blocks
__device__ int getGlobalIdx_3D_2D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y)
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
// 3D grid of 3D blocks
__device__ int getGlobalIdx_3D_3D()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/**
* Vector addition: C = A + B.
* Serial CPU execution
*/
#include <stdio.h>
#include <stdlib.h>
int main(void)
{
// ukuran/total elemen vektor
int numElements = 50000;
size_t size = numElements * sizeof(float);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
}
for (int i = 0; i < numElements; ++i)
{
h_C[i] = h_A[i] + h_B[i];
}
free(h_A);
free(h_B);
free(h_C);
return 0;
}
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int iNumElements)
{
// ambil indeks global work-item (thread)
int iGID = get_global_id(0);
// jumlah work-items (threads) bisa melebihi iNumElements
if (iGID < iNumElements)
{
// jumlahkan elemen vektor ke iGID
c[iGID] = a[iGID] + b[iGID];
}
}
/*
* Penjumlahan vektor
*
* Tested on CL_PLATFORM_VERSION: OpenCL 1.2 CUDA 9.0.282
*/
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define NUM_ELEMENTS 50000
#define MAX_SOURCE_SIZE (0x100000)
char *oclLoadProgSource(char *fileName, char *comment, size_t *source_size)
{
/* Load the source code containing the kernel*/
FILE *fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
char *source_str = (char*)malloc(MAX_SOURCE_SIZE);
*source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
return source_str;
}
int main(void)
{
// Isi sesuai dengan indeks platform yang ingin digunakan
// Indeks berdasarkan hasil device_query.c
int platformId = 0;
int deviceId = 0;
int i = 0;
int iNumElements = NUM_ELEMENTS;
// Alokasi dan inisialisi variable di memory host
float *srcA = (float *)malloc(sizeof(float) * iNumElements);
float *srcB = (float *)malloc(sizeof(float) * iNumElements);
float *dst = (float *)malloc(sizeof(float) * iNumElements);
i = 0;
for (i = 0; i < iNumElements; ++i)
{
srcA[i] = rand()/(float)RAND_MAX;
srcB[i] = rand()/(float)RAND_MAX;
}
cl_int clStatus;
// Ambil list platforms
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
cl_platform_id *platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id)*num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
// Pakai platform sesuai platformId
cl_platform_id cpPlatform = platforms[platformId];
// Ambil list devices
cl_uint num_devices;
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
cl_device_id *device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
// Pakai device sesuai deviceId
cl_device_id cdDevice = device_list[deviceId];
// Buat context
cl_context cxGPUContext = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
// Buat command queue (OpenCL < 2.0)
cl_command_queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &clStatus);
// Buat command-queue (OpenCL >= 2.0)
// cl_command_queue cqCommandQueue = clCreateCommandQueueWithProperties(cxGPUContext, cdDevice, 0, &clStatus);
// Alokasi memory di device
cl_mem cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
cl_mem cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
cl_mem cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * iNumElements, NULL, &clStatus);
// Tulis (salin) memory data dari host ke device
clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * iNumElements, srcA, 0, NULL, NULL);
clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * iNumElements, srcB, 0, NULL, NULL);
// Buat program dan build dari fungsi kernel
size_t szKernelLength;
char *cSourceCL = oclLoadProgSource("vectorAdd.cl", "// My comment\n", &szKernelLength);
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &clStatus);
clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
// Buat kernel
cl_kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &clStatus);
// Tentukan argumen kernel
clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
// Jalankan kernel
size_t szLocalWorkSize = 256; // ukuran work-group (block)
size_t szGlobalWorkSize = iNumElements; // jumlah seluruh work-items (threads)
clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
// Baca (salin) memory hasil dari device kembali ke host
clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * iNumElements, dst, 0, NULL, NULL);
// Validasi hasil
// i = 0;
// for (i = 0; i < iNumElements; i++) {
// if (fabs(srcA[i] + srcB[i] - dst[i]) > 1e5) {
// fprintf(stderr, "Result verification failed at element %d!\n", i);
// exit(EXIT_FAILURE);
// }
// }
// printf("Test PASSED\n");
// Dealokasi objek openCL
if(ckKernel)clReleaseKernel(ckKernel);
if(cpProgram)clReleaseProgram(cpProgram);
if(cqCommandQueue) {
clStatus = clFlush(cqCommandQueue);
clStatus = clFinish(cqCommandQueue);
}
if(cxGPUContext)clReleaseContext(cxGPUContext);
// Dealokasi memory device
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
if(cmDevDst)clReleaseMemObject(cmDevDst);
// Dealokasi memory host
free(srcA);
free(srcB);
free(dst);
free(device_list);
free(platforms);
return 0;
}
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/
#include <stdio.h>
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
// jika menggunakan indeks 2D, akan terdapat atribut x, y
// jika menggunakan indeks 3D, akan terdapat atribut x, y, z
int i = blockDim.x * blockIdx.x + threadIdx.x;
// karena jumlah thread yang berjalan dapat >= total elemen
if (i < numElements)
{
C[i] = A[i] + B[i];
}
}
int main(void)
{
// ukuran/total elemen vektor
int numElements = 50000;
size_t size = numElements * sizeof(float);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
float *d_A, *d_B, *d_C;
cudaMalloc((void **)&d_A, size);
cudaMalloc((void **)&d_B, size);
cudaMalloc((void **)&d_C, size);
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
}
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
// (50000 + 256 - 1) / 256 = 196 blocks/grid
// jadi ada 50176 threads yang akan dijalankan, yaitu lebih dari total elemen
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
// // alternatif
// dim3 gridDim(blocksPerGrid);
// dim3 blockDim(threadsPerBlock);
// vectorAdd<<<gridDim, blockDim>>>(d_A, d_B, d_C, numElements);
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// // validasi hasil
// for (int i = 0; i < numElements; ++i)
// {
// if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
// {
// fprintf(stderr, "Result verification failed at element %d!\n", i);
// exit(EXIT_FAILURE);
// }
// }
// printf("Test PASSED\n");
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment