Skip to content

Instantly share code, notes, and snippets.

@hube12
Last active October 9, 2019 22:04
Show Gist options
  • Save hube12/85528eaad176555b06a0edf7687d5db3 to your computer and use it in GitHub Desktop.
Save hube12/85528eaad176555b06a0edf7687d5db3 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>
#ifdef __linux
#include <unistd.h>
#endif
#include <errno.h>
#define min(a, b) ((a) < (b) ? (a) : (b))
#include <CL/cl.h>
#define CLEAR_LINE "\x1b[K"
#include "clutil.h"
#define TYPE CL_DEVICE_TYPE_ALL
size_t flp2 (size_t x) {
x = x | (x >> 1u);
x = x | (x >> 2u);
x = x | (x >> 4u);
x = x | (x >> 8u);
x = x | (x >> 16u);
return x - (x >> 1u);
}
static inline uint64_t get_timer() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ((uint64_t) ts.tv_sec) * 1000000000ULL + ts.tv_nsec;
}
static inline uint64_t start_timer() {
return get_timer();
}
static inline uint64_t stop_timer(uint64_t start) {
return get_timer() - start;
}
#define N_STATES 5
#define N_RULES (N_STATES * (N_STATES+1) / 2 * (N_STATES-1))
#define max(a, b) ((a) > (b) ? (a) : (b))
const char DAN_AUTOMATON[N_STATES-1][N_STATES][N_STATES] = {{{0,1,0,1,0},{1,3,2,0,3},{0,2,0,3,0},{1,0,3,0,0},{0,3,0,0,0}},{{1,1,0,3,2},{1,1,0,3,0},{0,0,3,0,0},{3,3,0,0,0},{2,0,0,0,0}},{{2,3,2,0,0},{3,2,3,3,2},{2,3,1,0,0},{0,3,0,3,3},{0,2,0,3,0}},{{2,2,2,2,2},{2,0,2,2,0},{2,2,3,0,0},{2,2,0,4,4},{2,0,0,4,0}}};
void encode_automaton(const char *multi_dimensional_array, char *dest) {
for (int middle = 0; middle < N_STATES - 1; middle++) {
for (int left = 0; left < N_STATES; left++) {
for (int right = 0; right < N_STATES; right++) {
int l = min(left, right);
int r = max(left, right);
int index = middle * (N_RULES / (N_STATES - 1));
index += ((N_STATES * (N_STATES+1)) - ((N_STATES-l) * (N_STATES-l+1))) / 2;
index += r - l;
dest[index] = *(multi_dimensional_array + right + N_STATES * (left + N_STATES * middle));
}
}
}
}
// Returns factorial of n
size_t factorial(size_t n) {
size_t res = 1;
for (size_t i = 2; i <= n; i++)
res = res * i;
return res;
}
size_t nCr(size_t n, size_t r) {
return factorial(n) / (factorial(r) * factorial(n - r));
}
size_t poww(size_t x, size_t n) {
size_t result = 1;
for (size_t i = 0; i < n; i++)
result *= x;
return result;
}
int main(int argc, char **argv) {
char automaton[N_RULES];
encode_automaton(&DAN_AUTOMATON[0][0][0], automaton);
// Settings
// Whenever you add a CL file, remember to also edit the bottom of CMakeLists.txt
const char *kernel_file = "kernel.cl";
const char *kernel_name = "start";
const char *header_names[] = {"jrand.cl"};
#ifdef __linux
printf("PID: %u\n", getpid());
#endif
// Find devices
cl_uint num_platforms;
check(clGetPlatformIDs(0, NULL, &num_platforms), "getPlatformIDs (num)");
printf("%d platforms:\n", num_platforms);
cl_platform_id platforms[num_platforms];
check(clGetPlatformIDs(num_platforms, platforms, NULL), "getPlatformIDs");
cl_device_id device = NULL;
cl_uint cus = 0;
printf("Available platforms:\n");
for (int i = 0; i < num_platforms; i++) {
char *info = getPlatformInfo(platforms[i]);
printf("%d: %s\n", i, info);
free(info);
cl_uint num_devices;
check(clGetDeviceIDs(platforms[i], TYPE, 0, NULL, &num_devices), "getDeviceIDs (num)");
cl_device_id devices[num_devices];
check(clGetDeviceIDs(platforms[i], TYPE, num_devices, devices, NULL), "getDeviceIDs");
printf(" %d available devices:\n", num_devices);
for (int j = 0; j < num_devices; j++) {
device_info *infos = getDeviceInfo(devices[j]);
printf(" %s\n", infos->info_str);
if (infos->compute_units >= cus) {
cus = infos->compute_units;
device = devices[j];
}
}
putchar('\n');
}
if (!device) {
fprintf(stderr, "No devices found.\n");
return -1;
}
// Create CL context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
device_info *info = getDeviceInfo(device);
printf("Using %s\n", info->info_str);
free(info);
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, 0, NULL);
cl_int error;
// Compile main source
const char *main_src = readFile(kernel_file);
cl_program main_cl = clCreateProgramWithSource(context, 1, &main_src, NULL, &error);
check(error, "Creating program");
printf("Compiling...\n");
error = clCompileProgram(main_cl, 0, NULL, NULL, 0, NULL, NULL, NULL, NULL);
if (error == CL_COMPILE_PROGRAM_FAILURE || error == CL_BUILD_PROGRAM_FAILURE) {
size_t log_size;
clGetProgramBuildInfo(main_cl, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(main_cl, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s j\n", log);
exit(error);
} else if (error) {
fprintf(stderr, "Error compiling: %s\n", getErrorString(error));
exit(error);
}
// Link
const cl_program programs[] = {main_cl};
printf("Linking...\n");
cl_program program = clLinkProgram(context, 0, NULL, NULL, 1, programs, NULL, NULL, &error);
check(error, "Linking");
cl_kernel kernel = clCreateKernel(program, kernel_name, NULL);
int queryDim[3] = {0,0,0};
// Main program, host side
size_t local_size;
check(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES , 3*sizeof(local_size), queryDim, NULL), "Get max work item sizes");
local_size = flp2(queryDim[0]);
cl_uint address_bits;
check(clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(address_bits), &address_bits, NULL), "Get device address bits");
size_t max_global_size = 1LLU << min(address_bits, 32);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: (%d %d %d)\n", queryDim[0], queryDim[1], queryDim[2]);
printf("Local size: %lld; Global size: %lld\n", local_size, max_global_size);
cl_mem mem_base_automaton = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(automaton), NULL, NULL);
cl_mem mem_local_results = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * local_size, NULL, NULL);
cl_mem mem_local_successes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_ulong) * local_size, NULL, NULL);
cl_mem mem_global_results = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * (max_global_size / local_size), NULL, NULL);
cl_mem mem_global_successes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_ulong) * (max_global_size / local_size), NULL, NULL);
check(clEnqueueWriteBuffer(queue, mem_base_automaton, CL_FALSE, 0, sizeof(automaton), automaton, 0, NULL, NULL), "Write Base Automaton");
check(clSetKernelArg(kernel, 3, sizeof(cl_mem), &mem_base_automaton), "Base Automaton");
check(clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_local_results), "Local Results");
check(clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_local_successes), "Local Successes");
check(clSetKernelArg(kernel, 6, sizeof(cl_mem), &mem_global_results), "Global Results");
check(clSetKernelArg(kernel, 7, sizeof(cl_mem), &mem_global_successes), "Global Successes");
char *kf = malloc(strlen(kernel_file));
strcpy(kf, kernel_file);
kf[strlen(kf) - 3] = 0;
printf("Executing %s.%s\n", kf, kernel_name);
for (int rule_changes = 1; rule_changes < 7; rule_changes++) {
size_t combination_count = poww(N_STATES - 1, (size_t) rule_changes) * nCr(N_RULES - 1, (size_t)rule_changes);
check(clSetKernelArg(kernel, 1, sizeof(int), &rule_changes), "Rule changes");
check(clSetKernelArg(kernel, 2, sizeof(size_t), &combination_count), "Combination count");
size_t global_size = min(max_global_size, (combination_count & (combination_count-1)) == 0 ? combination_count : flp2(combination_count * 2));
//printf("Work Load possible: %d %d %d \n",CL_DEVICE_MAX_WORK_GROUP_SIZE,CL_DEVICE_MAX_WORK_ITEM_SIZES,CL_DEVICE_LOCAL_MEM_SIZE);
char best_result = 0;
size_t best_combination_id = 0;
uint64_t t = start_timer();
for (size_t offset = 0; offset < combination_count; offset += global_size) {
float perc = offset * 100.f / combination_count;
uint64_t t2 = start_timer();
check(clSetKernelArg(kernel, 0, sizeof(offset), &offset), "Argument offset");
printf("\rx %3.3f%%", perc);
fflush(stdout);
check(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL), "\nExecute");
check(clFinish(queue), "\nFinish execute");
printf("\r<- %3.3f%%", perc);
fflush(stdout);
char result;
size_t combination_id;
check(clEnqueueReadBuffer(queue, mem_global_results, CL_TRUE, 0, sizeof(result), &result, 0, NULL, NULL), "\nRead");
check(clEnqueueReadBuffer(queue, mem_global_successes, CL_TRUE, 0, sizeof(combination_id), &combination_id, 0, NULL, NULL),
"\nRead");
uint64_t d2 = stop_timer(t2);
printf("\rw %3.3f%% %.4f" CLEAR_LINE, perc, d2 / 1000000.f);
fflush(stdout);
if (result > best_result) {
best_result = result;
best_combination_id = combination_id;
}
uint64_t d = get_timer() - t;
double per_item = (double) d / (offset + global_size);
double eta = ((double) d / ((offset + global_size) * 1000000000.)) * (combination_count - offset - global_size);
uint64_t d2ms = d2 / 1000000;
printf(" %fs / %llu items = %lfns/item, %llums/batch, ETA: %lfs (%dh%dm%ds)", d / 1000000000.f,
offset + global_size,
per_item, d2ms, eta,
(int) (eta / 3600), ((int) (eta / 60)) % 60, ((int) eta) % 60);
}
printf("Best combination for %d rule changes:\n", rule_changes);
printf("Combination ID: %lld\n", best_combination_id);
size_t combination_id = best_combination_id;
int min_index = 0;
for (int i = 0; i < rule_changes; i++) {
int possible_indices = (N_RULES - 1) - min_index - (rule_changes - 1 - i);
int index = 1 + min_index + combination_id % possible_indices;
combination_id /= possible_indices;
int dest_state = combination_id % (N_STATES - 1);
dest_state += dest_state >= automaton[index];
combination_id /= (N_STATES - 1);
min_index = index + 1;
int working_index = index;
int middle = working_index / (N_RULES / (N_STATES - 1));
working_index %= N_RULES / (N_STATES - 1);
int left, right;
for (int row_size = 5; row_size > 0; row_size--) {
if (row_size > working_index) {
left = 5 - row_size;
right = left + working_index;
}
working_index -= row_size;
}
printf("(%d, %d, %d) (index %d) -> %d\n", left, middle, right, index, dest_state);
}
puts("\rDone. ");
uint64_t d = stop_timer(t);
printf("%fs / %llu items = %lfns/item\n", d / 1000000000.f, combination_count, (double) d / combination_count);
}
return 0;
}
#define N_STATES 5
#define N_RULES (N_STATES * (N_STATES+1) / 2 * (N_STATES-1))
#define MAX_SIZE 12
#define MAX_ITERS 50
static inline char apply_automaton(char *automaton, char left, char middle, char right) {
int l = min(left, right);
int r = max(left, right);
int index = (middle - (middle == N_STATES - 1)) * (N_RULES / (N_STATES - 1));
index += ((N_STATES * (N_STATES+1)) - ((N_STATES-l) * (N_STATES-l+1))) / 2;
index += r - l;
return automaton[index];
}
static inline int simulate(int size, char *automaton) {
char a[MAX_SIZE];
char b[MAX_SIZE];
char *system = a;
char *last_system = b;
char *temp;
last_system[0] = 1;
for (int i = 1; i < size; i++)
last_system[i] = 0;
int first_end_state = 0;
int first_all_end_state = 0;
for (int y = 0; y < MAX_ITERS; y++) {
int end_state_count = 0;
system[0] = apply_automaton(automaton, N_STATES-1, last_system[0], last_system[1]);
end_state_count += system[0] == N_STATES-1;
for (int x = 1, e = size-1; x < e; x++) {
system[x] = apply_automaton(automaton, last_system[x-1], last_system[x], last_system[x+1]);
end_state_count += system[x] == N_STATES-1;
}
system[size-1] = apply_automaton(automaton, last_system[size-2], last_system[size-1], N_STATES-1);
end_state_count += system[size-1] == N_STATES-1;
first_end_state = (first_end_state != 0) * first_end_state + (first_end_state == 0) * y * (end_state_count != 0);
first_all_end_state = (first_all_end_state != 0) * first_all_end_state + (first_all_end_state == 0) * y * (end_state_count == size);
temp = system;
system = last_system;
last_system = temp;
}
return (first_all_end_state != 0) * (first_all_end_state == first_end_state);
}
// combination_count = (N_STATES - 1) ^ rule_changes * nCr(N_RULES - 1, rule_changes)
__kernel void start(ulong offset, int rule_changes, ulong combination_count, __constant char *base_automaton,
__global char *local_results, __global size_t *local_successes, __global char *global_results, __global size_t *global_successes) {
size_t global_id = get_global_id(0);
size_t local_id = get_local_id(0);
size_t local_size = get_local_size(0);
size_t global_index = global_id / local_size;
// Combination to rule changes
size_t combination_id = offset + global_id;
if (combination_id > combination_count)
return;
char automaton[N_RULES];
for (int i = 0; i < N_RULES; i++)
automaton[i] = base_automaton[i];
int min_index = 0;
for (int i = 0; i < rule_changes; i++) {
int possible_indices = (N_RULES - 1) - min_index - (rule_changes - 1 - i);
int index = 1 + min_index + combination_id % possible_indices;
combination_id /= possible_indices;
int dest_state = combination_id % (N_STATES - 1);
dest_state += dest_state >= automaton[index];
combination_id /= (N_STATES - 1);
min_index = index + 1;
automaton[index] = dest_state;
}
// Simulate
int successes = simulate(7, automaton);
successes += simulate(8, automaton);
successes += simulate(10, automaton);
successes = (successes + 1) * simulate(9, automaton);
// Search for the result with the most successes
local_results[local_id] = successes;
local_successes[local_id] = offset + global_id;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = local_size / 2; i > 0; i >>= 1) {
if (local_id < i && local_results[local_id + i] > local_results[local_id]) {
local_results[local_id] = local_results[local_id + i];
local_successes[local_id] = local_successes[local_id + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_id == 0) {
global_results[global_index] = local_results[0];
global_successes[global_index] = local_successes[0];
}
barrier(CLK_GLOBAL_MEM_FENCE);
for (int i = get_global_size(0) / local_size / 2; i > 0; i >>= 2) {
if (local_id == 0 && global_index < i && global_results[global_index + i] > global_results[global_index]) {
global_results[global_index] = global_results[global_index + i];
global_successes[global_index] = global_successes[global_index + i];
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment