Skip to content

Instantly share code, notes, and snippets.

@PlasmaPower
Last active September 30, 2022 22:51
Show Gist options
  • Save PlasmaPower/c8a650b3268dc5cd8a54755431427350 to your computer and use it in GitHub Desktop.
Save PlasmaPower/c8a650b3268dc5cd8a54755431427350 to your computer and use it in GitHub Desktop.
AMD ROCm OpenCL miscompilation
#define __global
#define __kernel
#define __constant const
#include <stdio.h>
void run_test(unsigned int zero, unsigned char *result);
/* Host-side driver: execute the kernel body natively and print what it
   produced, for comparison against the GPU run of the same source. */
int main(void) {
    unsigned int host_out = 0;
    run_test(0u, (unsigned char *)&host_out);
    printf("Host result: %x\n", host_out);
    return 0;
}
/* Minimal stand-ins for the fixed-width types the kernel needs; when this
   file is compiled as host C (see the #defines at the top of the gist),
   these replace the OpenCL built-ins. */
typedef unsigned int uint32_t;
typedef unsigned long uint64_t;
/* 32x32 -> 64-bit widening multiply, same macro as curve25519-donna. */
#define mul32x32_64(a,b) (((uint64_t)(a))*(b))
/* Limb masks for the 25/26-bit mixed-radix field representation. */
__constant uint32_t reduce_mask_25 = (1 << 25) - 1;
__constant uint32_t reduce_mask_26 = (1 << 26) - 1;
/*
 * Reduced single-limb version of curve25519-donna's square_times loop,
 * kept only as a compiler-miscompilation reproduction: the host build and
 * the ROCm GPU build of this exact code produce different results.
 * NOTE(review): the seemingly-dead values (m9, the 12345678 multiply, r1)
 * and the exact statement order are deliberate bait for the optimizer —
 * do not simplify or reorder anything in this function.
 */
static void curve25519_square_times(uint32_t *out, uint32_t in, int count, uint32_t zero) {
uint32_t r0 = in, r1 = 0;
uint64_t m0 = 0, m9 = 0;
uint32_t p = 0;
do {
/* square the limb, then reduce to 26 bits with a carry chain */
m0 = mul32x32_64(r0, r0);
m9 = mul32x32_64(r0, 12345678);
r0 = (uint32_t)m0 & reduce_mask_26;
m9 += m0 >> 26;
p = (uint32_t)(m9 >> 25);
/* fold the carry back in times 19, as in reduction mod 2^255 - 19 */
m0 = r0 + mul32x32_64(p,19);
r0 = (uint32_t)m0 & reduce_mask_26;
r1 += (uint32_t)(m0 >> 26);
} while (--count);
// Pretend r1 matters for the optimizer
r0 ^= r1 & zero;
*out = r0;
}
/*
 * Shaped like curve25519-donna's recip(): two dependent square_times
 * calls. The intermediate `a` is stored to the output buffer so the host
 * can observe the exact value the second call consumed (this is where the
 * host/GPU results diverge).
 */
static void curve25519_recip(uint32_t *out, uint32_t z, __global unsigned char *result, uint32_t zero) {
uint32_t a = 0;
curve25519_square_times(&a, z, 1, zero);
/* NOTE(review): cast assumes `result` is 4-byte aligned — true here
   because the host allocates it as a uint; verify if callers change. */
*(__global uint32_t*)result = a;
curve25519_square_times(out, a, 2, zero);
}
/*
 * Kernel entry point. `zero` is always 0 at runtime; it exists solely to
 * hide constants from the optimizer so dead-looking code survives.
 * `result` receives the intermediate limb written inside curve25519_recip.
 */
__kernel void run_test(uint32_t zero, __global unsigned char *result) {
uint32_t x = 42693351;
// Hide this value from the optimizer
x ^= zero;
uint32_t out = 0;
curve25519_recip(&out, x, result, zero);
// Pretend to the optimizer that the output here matters (it also diverges)
*result ^= (out >> 16) & zero;
}
/* Fixed-width types for the kernel (OpenCL C does not provide <stdint.h>;
   these match the sizes the field arithmetic below relies on). */
typedef unsigned int uint32_t;
typedef unsigned long uint64_t;
/* 32x32 -> 64-bit widening multiply, same macro as curve25519-donna. */
#define mul32x32_64(a,b) (((uint64_t)(a))*(b))
/* Limb masks for the 25/26-bit mixed-radix field representation. */
__constant uint32_t reduce_mask_25 = (1 << 25) - 1;
__constant uint32_t reduce_mask_26 = (1 << 26) - 1;
/*
 * Reduced single-limb version of curve25519-donna's square_times loop,
 * kept only as a compiler-miscompilation reproduction: the host build and
 * the ROCm GPU build of this exact code produce different results.
 * NOTE(review): the seemingly-dead values (m9, the 12345678 multiply, r1)
 * and the exact statement order are deliberate bait for the optimizer —
 * do not simplify or reorder anything in this function.
 */
static void curve25519_square_times(uint32_t *out, uint32_t in, int count, uint32_t zero) {
uint32_t r0 = in, r1 = 0;
uint64_t m0 = 0, m9 = 0;
uint32_t p = 0;
do {
/* square the limb, then reduce to 26 bits with a carry chain */
m0 = mul32x32_64(r0, r0);
m9 = mul32x32_64(r0, 12345678);
r0 = (uint32_t)m0 & reduce_mask_26;
m9 += m0 >> 26;
p = (uint32_t)(m9 >> 25);
/* fold the carry back in times 19, as in reduction mod 2^255 - 19 */
m0 = r0 + mul32x32_64(p,19);
r0 = (uint32_t)m0 & reduce_mask_26;
r1 += (uint32_t)(m0 >> 26);
} while (--count);
// Pretend r1 matters for the optimizer
r0 ^= r1 & zero;
*out = r0;
}
/*
 * Shaped like curve25519-donna's recip(): two dependent square_times
 * calls. The intermediate `a` is stored to the output buffer so the host
 * can observe the exact value the second call consumed (this is where the
 * host/GPU results diverge).
 */
static void curve25519_recip(uint32_t *out, uint32_t z, __global unsigned char *result, uint32_t zero) {
uint32_t a = 0;
curve25519_square_times(&a, z, 1, zero);
/* NOTE(review): cast assumes `result` is 4-byte aligned — true here
   because the host allocates it as a uint; verify if callers change. */
*(__global uint32_t*)result = a;
curve25519_square_times(out, a, 2, zero);
}
/*
 * Kernel entry point. `zero` is always 0 at runtime; it exists solely to
 * hide constants from the optimizer so dead-looking code survives.
 * `result` receives the intermediate limb written inside curve25519_recip.
 */
__kernel void run_test(uint32_t zero, __global unsigned char *result) {
uint32_t x = 42693351;
// Hide this value from the optimizer
x ^= zero;
uint32_t out = 0;
curve25519_recip(&out, x, result, zero);
// Pretend to the optimizer that the output here matters (it also diverges)
*result ^= (out >> 16) & zero;
}
// Based on https://github.com/rsnemmen/OpenCL-examples/blob/master/Hello_World/hello.c
// I usually use Rust but I thought a C example runner would be clearer.
#define PROGRAM_FILE "kernel.cl"
#define KERNEL_FUNC "run_test"
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform
The `platform` structure identifies the first platform identified by the
OpenCL runtime. A platform identifies a vendor's installation, so a system
may have an NVIDIA platform and an AMD platform.
The `device` structure corresponds to the first accessible device
associated with the platform. Because the second parameter is
`CL_DEVICE_TYPE_GPU`, this device must be a GPU.
*/
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
// Access a device
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
size_t len;
clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, NULL, &len);
char* name = (char*)malloc(len);
clGetDeviceInfo(dev, CL_DEVICE_NAME, len, name, NULL);
printf("Running on GPU: %s\n", name);
return dev;
}
/* Create a program from the source file `filename` and compile it for the
   context `ctx`. On compile failure, prints the build log and exits; on any
   I/O or allocation failure, prints a diagnostic and exits. Returns the
   built cl_program (caller releases it with clReleaseProgram). */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    long file_len;
    int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if(program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    file_len = ftell(program_handle);
    /* ftell returns -1 on error; the original cast that straight into a
       size_t, turning a failure into a huge allocation request. */
    if(file_len < 0) {
        perror("Couldn't measure the program file");
        exit(1);
    }
    program_size = (size_t)file_len;
    rewind(program_handle);
    program_buffer = malloc(program_size + 1);
    if(program_buffer == NULL) {
        perror("Couldn't allocate the program buffer");
        exit(1);
    }
    program_buffer[program_size] = '\0';
    /* fread may return short; a short read would leave garbage in the
       buffer that we would then hand to the OpenCL compiler. */
    if(fread(program_buffer, sizeof(char), program_size, program_handle) != program_size) {
        perror("Couldn't read the program file");
        exit(1);
    }
    fclose(program_handle);

    /* Create the program object from the source text just read */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if(err < 0) {
        perror("Couldn't create the program");
        exit(1);
    }
    free(program_buffer);

    /* Build the program. The fourth parameter accepts gcc-like options,
       e.g. -DMACRO=VALUE or -cl-opt-disable to turn off optimization. */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if(err < 0) {
        /* Find size of log and print to std output */
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
        program_log = malloc(log_size + 1);
        if(program_log == NULL) {
            perror("Couldn't allocate the build log buffer");
            exit(1);
        }
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_size + 1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }
    return program;
}
/* Host driver: build kernel.cl, run the run_test kernel with a single
   work-item, and print the 32-bit value the kernel wrote, so it can be
   compared with the host-compiled result of the same source. */
int main() {
    /* OpenCL structures */
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel;
    cl_command_queue queue;
    cl_int err;                      /* original also declared unused i, j */
    size_t local_size, global_size;

    /* Create device and a context containing only that device */
    device = create_device();
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);

    /* A single work-item in a single work-group suffices: the kernel is a
       scalar miscompilation repro, not a data-parallel computation. */
    global_size = 1;
    local_size = 1;

    /* Initialize the host copy — CL_MEM_COPY_HOST_PTR copies this memory
       to the device, so it must not be an uninitialized read. */
    uint32_t result = 0;
    cl_mem result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 4, &result, &err);
    if(err < 0) {
        perror("Couldn't create a buffer");
        exit(1);
    }

    /* Create a command queue (in-order, no profiling) */
    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err < 0) {
        perror("Couldn't create a command queue");
        exit(1);
    }

    /* Create the kernel object for run_test */
    kernel = clCreateKernel(program, KERNEL_FUNC, &err);
    if(err < 0) {
        perror("Couldn't create a kernel");
        exit(1);
    }

    /* Set kernel arguments. `zero` really is zero at runtime; it exists
       only to hide constants from the kernel compiler's optimizer. Error
       codes are negative, so OR-ing them preserves failure detection. */
    uint32_t zero = 0;
    err = clSetKernelArg(kernel, 0, sizeof(zero), &zero);
    err |= clSetKernelArg(kernel, 1, sizeof(result_buffer), &result_buffer);
    if(err < 0) {
        perror("Couldn't create a kernel argument");
        exit(1);
    }

    /* Enqueue the kernel: global_size work-items total, local_size
       work-items per work-group (1 and 1 here). */
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size,
            &local_size, 0, NULL, NULL);
    if(err < 0) {
        perror("Couldn't enqueue the kernel");
        exit(1);
    }

    /* Read the kernel's output; CL_TRUE makes the read block until the
       kernel has finished and the copy is complete. */
    err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0,
            sizeof(result), &result, 0, NULL, NULL);
    if(err < 0) {
        perror("Couldn't read the buffer");
        exit(1);
    }
    printf("GPU result: %x\n", result);

    /* Deallocate resources */
    clReleaseKernel(kernel);
    clReleaseMemObject(result_buffer);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment