Last active
September 30, 2022 22:51
-
-
Save PlasmaPower/c8a650b3268dc5cd8a54755431427350 to your computer and use it in GitHub Desktop.
AMD ROCm OpenCL miscompilation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define __global | |
#define __kernel | |
#define __constant const | |
#include <stdio.h> | |
void run_test(unsigned int zero, unsigned char *result); | |
int main() { | |
unsigned int result = 0; | |
run_test(0, (unsigned char *)&result); | |
printf("Host result: %x\n", result); | |
return 0; | |
} | |
typedef unsigned int uint32_t; | |
typedef unsigned long uint64_t; | |
#define mul32x32_64(a,b) (((uint64_t)(a))*(b)) | |
__constant uint32_t reduce_mask_25 = (1 << 25) - 1; | |
__constant uint32_t reduce_mask_26 = (1 << 26) - 1; | |
static void curve25519_square_times(uint32_t *out, uint32_t in, int count, uint32_t zero) { | |
uint32_t r0 = in, r1 = 0; | |
uint64_t m0 = 0, m9 = 0; | |
uint32_t p = 0; | |
do { | |
m0 = mul32x32_64(r0, r0); | |
m9 = mul32x32_64(r0, 12345678); | |
r0 = (uint32_t)m0 & reduce_mask_26; | |
m9 += m0 >> 26; | |
p = (uint32_t)(m9 >> 25); | |
m0 = r0 + mul32x32_64(p,19); | |
r0 = (uint32_t)m0 & reduce_mask_26; | |
r1 += (uint32_t)(m0 >> 26); | |
} while (--count); | |
// Pretend r1 matters for the optimizer | |
r0 ^= r1 & zero; | |
*out = r0; | |
} | |
static void curve25519_recip(uint32_t *out, uint32_t z, __global unsigned char *result, uint32_t zero) { | |
uint32_t a = 0; | |
curve25519_square_times(&a, z, 1, zero); | |
*(__global uint32_t*)result = a; | |
curve25519_square_times(out, a, 2, zero); | |
} | |
__kernel void run_test(uint32_t zero, __global unsigned char *result) { | |
uint32_t x = 42693351; | |
// Hide this value from the optimizer | |
x ^= zero; | |
uint32_t out = 0; | |
curve25519_recip(&out, x, result, zero); | |
// Pretend to the optimizer that the output here matters (it also diverges) | |
*result ^= (out >> 16) & zero; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
typedef unsigned int uint32_t; | |
typedef unsigned long uint64_t; | |
#define mul32x32_64(a,b) (((uint64_t)(a))*(b)) | |
__constant uint32_t reduce_mask_25 = (1 << 25) - 1; | |
__constant uint32_t reduce_mask_26 = (1 << 26) - 1; | |
static void curve25519_square_times(uint32_t *out, uint32_t in, int count, uint32_t zero) { | |
uint32_t r0 = in, r1 = 0; | |
uint64_t m0 = 0, m9 = 0; | |
uint32_t p = 0; | |
do { | |
m0 = mul32x32_64(r0, r0); | |
m9 = mul32x32_64(r0, 12345678); | |
r0 = (uint32_t)m0 & reduce_mask_26; | |
m9 += m0 >> 26; | |
p = (uint32_t)(m9 >> 25); | |
m0 = r0 + mul32x32_64(p,19); | |
r0 = (uint32_t)m0 & reduce_mask_26; | |
r1 += (uint32_t)(m0 >> 26); | |
} while (--count); | |
// Pretend r1 matters for the optimizer | |
r0 ^= r1 & zero; | |
*out = r0; | |
} | |
static void curve25519_recip(uint32_t *out, uint32_t z, __global unsigned char *result, uint32_t zero) { | |
uint32_t a = 0; | |
curve25519_square_times(&a, z, 1, zero); | |
*(__global uint32_t*)result = a; | |
curve25519_square_times(out, a, 2, zero); | |
} | |
__kernel void run_test(uint32_t zero, __global unsigned char *result) { | |
uint32_t x = 42693351; | |
// Hide this value from the optimizer | |
x ^= zero; | |
uint32_t out = 0; | |
curve25519_recip(&out, x, result, zero); | |
// Pretend to the optimizer that the output here matters (it also diverges) | |
*result ^= (out >> 16) & zero; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Based on https://github.com/rsnemmen/OpenCL-examples/blob/master/Hello_World/hello.c | |
// I usually use Rust but I thought a C example runner would be clearer. | |
#define PROGRAM_FILE "kernel.cl" | |
#define KERNEL_FUNC "run_test" | |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS | |
#include <math.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
#ifdef MAC | |
#include <OpenCL/cl.h> | |
#else | |
#include <CL/cl.h> | |
#endif | |
/* Find a GPU or CPU associated with the first available platform | |
The `platform` structure identifies the first platform identified by the | |
OpenCL runtime. A platform identifies a vendor's installation, so a system | |
may have an NVIDIA platform and an AMD platform. | |
The `device` structure corresponds to the first accessible device | |
associated with the platform. Because the second parameter is | |
`CL_DEVICE_TYPE_GPU`, this device must be a GPU. | |
*/ | |
cl_device_id create_device() { | |
cl_platform_id platform; | |
cl_device_id dev; | |
int err; | |
/* Identify a platform */ | |
err = clGetPlatformIDs(1, &platform, NULL); | |
if(err < 0) { | |
perror("Couldn't identify a platform"); | |
exit(1); | |
} | |
// Access a device | |
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); | |
if(err < 0) { | |
perror("Couldn't access any devices"); | |
exit(1); | |
} | |
size_t len; | |
clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, NULL, &len); | |
char* name = (char*)malloc(len); | |
clGetDeviceInfo(dev, CL_DEVICE_NAME, len, name, NULL); | |
printf("Running on GPU: %s\n", name); | |
return dev; | |
} | |
/* Create program from a file and compile it */ | |
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { | |
cl_program program; | |
FILE *program_handle; | |
char *program_buffer, *program_log; | |
size_t program_size, log_size; | |
int err; | |
/* Read program file and place content into buffer */ | |
program_handle = fopen(filename, "r"); | |
if(program_handle == NULL) { | |
perror("Couldn't find the program file"); | |
exit(1); | |
} | |
fseek(program_handle, 0, SEEK_END); | |
program_size = ftell(program_handle); | |
rewind(program_handle); | |
program_buffer = (char*)malloc(program_size + 1); | |
program_buffer[program_size] = '\0'; | |
fread(program_buffer, sizeof(char), program_size, program_handle); | |
fclose(program_handle); | |
/* Create program from file | |
Creates a program from the source code in the add_numbers.cl file. | |
Specifically, the code reads the file's content into a char array | |
called program_buffer, and then calls clCreateProgramWithSource. | |
*/ | |
program = clCreateProgramWithSource(ctx, 1, | |
(const char**)&program_buffer, &program_size, &err); | |
if(err < 0) { | |
perror("Couldn't create the program"); | |
exit(1); | |
} | |
free(program_buffer); | |
/* Build program | |
The fourth parameter accepts options that configure the compilation. | |
These are similar to the flags used by gcc. For example, you can | |
define a macro with the option -DMACRO=VALUE and turn off optimization | |
with -cl-opt-disable. | |
*/ | |
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); | |
if(err < 0) { | |
/* Find size of log and print to std output */ | |
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, | |
0, NULL, &log_size); | |
program_log = (char*) malloc(log_size + 1); | |
program_log[log_size] = '\0'; | |
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, | |
log_size + 1, program_log, NULL); | |
printf("%s\n", program_log); | |
free(program_log); | |
exit(1); | |
} | |
return program; | |
} | |
int main() { | |
/* OpenCL structures */ | |
cl_device_id device; | |
cl_context context; | |
cl_program program; | |
cl_kernel kernel; | |
cl_command_queue queue; | |
cl_int i, j, err; | |
size_t local_size, global_size; | |
/* Create device and context | |
Creates a context containing only one device — the device structure | |
created earlier. | |
*/ | |
device = create_device(); | |
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); | |
if(err < 0) { | |
perror("Couldn't create a context"); | |
exit(1); | |
} | |
/* Build program */ | |
program = build_program(context, device, PROGRAM_FILE); | |
/* Create data buffer | |
• `global_size`: total number of work items that will be | |
executed on the GPU (e.g. total size of your array) | |
• `local_size`: size of local workgroup. Each workgroup contains | |
several work items and goes to a compute unit | |
In this example, the kernel is executed by eight work-items divided into | |
two work-groups of four work-items each. Returning to my analogy, | |
this corresponds to a school containing eight students divided into | |
two classrooms of four students each. | |
Notes: | |
• Intel recommends workgroup size of 64-128. Often 128 is minimum to | |
get good performance on GPU | |
• On NVIDIA Fermi, workgroup size must be at least 192 for full | |
utilization of cores | |
• Optimal workgroup size differs across applications | |
*/ | |
global_size = 1; | |
local_size = 1; | |
uint32_t result; | |
cl_mem result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 4, &result, &err); | |
if(err < 0) { | |
perror("Couldn't create a buffer"); | |
exit(1); | |
}; | |
/* Create a command queue | |
Does not support profiling or out-of-order-execution | |
*/ | |
queue = clCreateCommandQueue(context, device, 0, &err); | |
if(err < 0) { | |
perror("Couldn't create a command queue"); | |
exit(1); | |
}; | |
/* Create a kernel */ | |
kernel = clCreateKernel(program, KERNEL_FUNC, &err); | |
if(err < 0) { | |
perror("Couldn't create a kernel"); | |
exit(1); | |
}; | |
/* Create kernel arguments */ | |
uint32_t zero = 0; | |
err = clSetKernelArg(kernel, 0, sizeof(zero), &zero); | |
err |= clSetKernelArg(kernel, 1, sizeof(result_buffer), &result_buffer); | |
if(err < 0) { | |
perror("Couldn't create a kernel argument"); | |
exit(1); | |
} | |
/* Enqueue kernel | |
At this point, the application has created all the data structures | |
(device, kernel, program, command queue, and context) needed by an | |
OpenCL host application. Now, it deploys the kernel to a device. | |
Of the OpenCL functions that run on the host, clEnqueueNDRangeKernel | |
is probably the most important to understand. Not only does it deploy | |
kernels to devices, it also identifies how many work-items should | |
be generated to execute the kernel (global_size) and the number of | |
work-items in each work-group (local_size). | |
*/ | |
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, | |
&local_size, 0, NULL, NULL); | |
if(err < 0) { | |
perror("Couldn't enqueue the kernel"); | |
exit(1); | |
} | |
/* Read the kernel's output */ | |
err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, | |
sizeof(result), &result, 0, NULL, NULL); | |
if(err < 0) { | |
perror("Couldn't read the buffer"); | |
exit(1); | |
} | |
printf("GPU result: %x\n", result); | |
/* Deallocate resources */ | |
clReleaseKernel(kernel); | |
clReleaseMemObject(result_buffer); | |
clReleaseCommandQueue(queue); | |
clReleaseProgram(program); | |
clReleaseContext(context); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment