Skip to content

Instantly share code, notes, and snippets.

@athas
Created June 3, 2019 11:26
Show Gist options
  • Save athas/09a4da77f0ac1483fa6a4263602e6eaf to your computer and use it in GitHub Desktop.
Save athas/09a4da77f0ac1483fa6a4263602e6eaf to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <stdint.h>
#undef NDEBUG
#include <assert.h>
// Start of panic.h.
#include <stdarg.h>
static const char *fut_progname;
/* Write "<fut_progname>: " followed by the printf-style message to
   stderr, then terminate the process with exit status 'eval'. */
static void panic(int eval, const char *fmt, ...)
{
  va_list args;

  fprintf(stderr, "%s: ", fut_progname);
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  exit(eval);
}
/* For generating arbitrary-sized error messages.  The message is
   formatted printf-style into a freshly allocated buffer, and it is
   the caller's responsibility to free the buffer at some point.
   Returns NULL if formatting fails or memory cannot be allocated. */
static char* msgprintf(const char *s, ...) {
  va_list vl;

  // First pass: only measure how many characters the message needs.
  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl);
  if (len < 0) {
    return NULL;
  }

  size_t needed = (size_t)len + 1; // Room for the terminating NUL.
  char *buffer = malloc(needed);
  if (buffer == NULL) {
    return NULL;
  }

  // A consumed va_list cannot be reused, so start a fresh traversal.
  va_start(vl, s);
  vsnprintf(buffer, needed, s, vl);
  va_end(vl);
  return buffer;
}
// End of panic.h.
// Start of timing.h.
// The function get_wall_time() returns the wall time in microseconds
// (with an unspecified offset).
#ifdef _WIN32
#include <windows.h>
static int64_t get_wall_time(void) {
LARGE_INTEGER time,freq;
assert(QueryPerformanceFrequency(&freq));
assert(QueryPerformanceCounter(&time));
return ((double)time.QuadPart / freq.QuadPart) * 1000000;
}
#else
/* Assuming POSIX */
#include <time.h>
#include <sys/time.h>
/* POSIX implementation: the current time as reported by
   gettimeofday(), expressed in microseconds. */
static int64_t get_wall_time(void) {
  struct timeval now;
  // Keep the call outside assert() so it is always performed.
  int r = gettimeofday(&now, NULL);
  assert(r == 0);
  // Widen before multiplying so the product cannot overflow on
  // platforms where tv_sec is a 32-bit type.
  return (int64_t)now.tv_sec * 1000000 + now.tv_usec;
}
#endif
// End of timing.h.
#ifdef _MSC_VER
#define inline __inline
#endif
#include <string.h>
#include <inttypes.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>
// Start of lock.h.
/* A very simple cross-platform implementation of locks. Uses
pthreads on POSIX systems and Win32 mutex objects on Windows.
Futhark's host-level code is not multithreaded, but user code may
be, so we need some mechanism for ensuring atomic access to API
functions. This is that mechanism. It is not exposed to user code
at all, so we do not have to worry about name collisions. */
#ifdef _WIN32
typedef HANDLE lock_t;
/* Create an unnamed, initially unlocked mutex and store its handle in
   *lock.  The same handle is returned. */
static lock_t create_lock(lock_t *lock) {
  *lock = CreateMutex(NULL,  /* Default security attributes. */
                      FALSE, /* Initially unlocked. */
                      NULL); /* Unnamed. */
  // The function is declared to return a lock_t; without an explicit
  // return, any caller reading the result would get an undefined value.
  return *lock;
}
/* Block until the mutex behind *lock has been acquired. */
static void lock_lock(lock_t *lock) {
  // Wait outside assert() so the lock is still taken if assertions
  // are ever compiled out.
  DWORD r = WaitForSingleObject(*lock, INFINITE);
  assert(r == WAIT_OBJECT_0);
}
/* Release the mutex behind *lock. */
static void lock_unlock(lock_t *lock) {
  // Release outside assert() so the lock is still dropped if
  // assertions are ever compiled out.
  BOOL released = ReleaseMutex(*lock);
  assert(released);
}
/* Close the mutex handle stored in *lock. */
static void free_lock(lock_t *lock) {
  HANDLE mutex = *lock;
  CloseHandle(mutex);
}
#else
/* Assuming POSIX */
#include <pthread.h>
typedef pthread_mutex_t lock_t;
/* Initialise *lock as a mutex with default attributes; asserts that
   initialisation succeeded. */
static void create_lock(lock_t *lock) {
  const int status = pthread_mutex_init(lock, NULL);
  assert(status == 0);
}
/* Acquire *lock; asserts that pthread_mutex_lock() succeeded. */
static void lock_lock(lock_t *lock) {
  const int status = pthread_mutex_lock(lock);
  assert(status == 0);
}
/* Release *lock; asserts that pthread_mutex_unlock() succeeded. */
static void lock_unlock(lock_t *lock) {
  const int status = pthread_mutex_unlock(lock);
  assert(status == 0);
}
/* Counterpart of create_lock().  This implementation performs no
   work; the parameter is only referenced to mark it as used. */
static void free_lock(lock_t *lock) {
  (void)lock; /* Nothing to do for pthreads. */
}
#endif
// End of lock.h.
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_SILENCE_DEPRECATION // For macOS.
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
typedef cl_mem fl_mem_t;
// Start of free_list.h.
/* An entry in the free list. May be invalid, to avoid having to
deallocate entries as soon as they are removed. There is also a
tag, to help with memory reuse. */
struct free_list_entry {
// Size recorded for the block when it was inserted.
size_t size;
// The memory object held in this slot.
fl_mem_t mem;
// Tag given at insertion.  Lookups compare this pointer itself, not
// the characters it points to.
const char *tag;
// Nonzero while the slot holds a block; cleared when it is removed.
unsigned char valid;
};
/* A growable array of free_list_entry slots.  Slots are marked
   invalid rather than removed when a block is taken out. */
struct free_list {
struct free_list_entry *entries; // Pointer to entries.
int capacity; // Number of entries.
int used; // Number of valid entries.
};
void free_list_init(struct free_list *l) {
l->capacity = 30; // Picked arbitrarily.
l->used = 0;
l->entries = malloc(sizeof(struct free_list_entry) * l->capacity);
for (int i = 0; i < l->capacity; i++) {
l->entries[i].valid = 0;
}
}
/* Remove invalid entries from the free list: the valid entries are
   moved to the front (keeping their order) and the storage is shrunk
   to fit.  At least one slot is always kept, so that the list never
   ends up with a capacity of zero. */
void free_list_pack(struct free_list *l) {
  int p = 0;
  for (int i = 0; i < l->capacity; i++) {
    if (l->entries[i].valid) {
      l->entries[p] = l->entries[i];
      p++;
    }
  }
  // Now p == l->used, and the valid entries occupy slots [0, p).

  // Never shrink to zero slots: realloc() with a size of zero may free
  // the storage, and a capacity of zero cannot be grown by doubling.
  int new_capacity = p > 0 ? p : 1;
  struct free_list_entry *shrunk =
    realloc(l->entries, (size_t)new_capacity * sizeof(struct free_list_entry));

  if (shrunk != NULL) {
    l->entries = shrunk;
    l->capacity = new_capacity;
  }
  // Whether or not shrinking succeeded, every slot after the packed
  // prefix must read as invalid; otherwise stale copies of entries
  // that were moved forward would be found a second time.
  for (int i = p; i < l->capacity; i++) {
    l->entries[i].valid = 0;
  }
}
/* Release the slot storage.  Asserts that no valid entries remain
   in the list. */
void free_list_destroy(struct free_list *l) {
  const int still_held = l->used;
  assert(still_held == 0);
  free(l->entries);
}
/* Return the index of the first invalid slot, or l->capacity if
   every slot is valid. */
int free_list_find_invalid(struct free_list *l) {
  int slot = 0;
  while (slot < l->capacity && l->entries[slot].valid) {
    slot++;
  }
  return slot;
}
/* Store a block in the first invalid slot, recording its size and
   tag.  If every slot is occupied the storage is doubled first (or
   grown to a single slot if the list currently has none).  Asserts
   that growing the storage succeeded. */
void free_list_insert(struct free_list *l, size_t size, fl_mem_t mem, const char *tag) {
  int i = free_list_find_invalid(l);
  if (i == l->capacity) {
    // List is full, so we have to grow it.  Doubling zero would leave
    // it at zero, so an empty list grows to one slot instead.
    int new_capacity = l->capacity > 0 ? l->capacity * 2 : 1;
    struct free_list_entry *grown =
      realloc(l->entries, (size_t)new_capacity * sizeof(struct free_list_entry));
    assert(grown != NULL);
    // The newly added slots start out invalid.
    for (int j = l->capacity; j < new_capacity; j++) {
      grown[j].valid = 0;
    }
    l->entries = grown;
    l->capacity = new_capacity;
  }
  // Now 'i' points to the first invalid entry.
  l->entries[i].valid = 1;
  l->entries[i].size = size;
  l->entries[i].mem = mem;
  l->entries[i].tag = tag;
  l->used++;
}
/* Find and remove the first valid block whose tag is the same
   pointer as 'tag'.  On success the block's recorded size and memory
   object are stored in *size_out and *mem_out and 0 is returned;
   otherwise 1 is returned.  The size is not checked here. */
int free_list_find(struct free_list *l, const char *tag, size_t *size_out, fl_mem_t *mem_out) {
  for (int k = 0; k < l->capacity; k++) {
    struct free_list_entry *slot = &l->entries[k];
    if (!slot->valid || slot->tag != tag) {
      continue;
    }
    slot->valid = 0;
    *size_out = slot->size;
    *mem_out = slot->mem;
    l->used--;
    return 0;
  }
  return 1;
}
/* Take the first valid block out of the free list and store its
   memory object in *mem_out.  Returns 0 if a block was removed, and
   nonzero if the list held no valid block. */
int free_list_first(struct free_list *l, fl_mem_t *mem_out) {
  int k = 0;
  while (k < l->capacity && !l->entries[k].valid) {
    k++;
  }
  if (k == l->capacity) {
    return 1;
  }
  l->entries[k].valid = 0;
  *mem_out = l->entries[k].mem;
  l->used--;
  return 0;
}
// End of free_list.h.
// Start of opencl.h.
// Pass the result of expression 'e', its source text, and the current
// file and line to opencl_succeed_fatal().
#define OPENCL_SUCCEED_FATAL(e) opencl_succeed_fatal(e, #e, __FILE__, __LINE__)
// Same, but through opencl_succeed_nonfatal(), which yields an error
// message string on failure and NULL on success.
#define OPENCL_SUCCEED_NONFATAL(e) opencl_succeed_nonfatal(e, #e, __FILE__, __LINE__)
// Take care not to override an existing error.
// Requires variables named 'ctx' and 'bad' to be in scope where the
// macro is used.  If 'e' fails and ctx->error is not yet set, the
// message is stored there and the enclosing function returns 'bad'.
// If an error is already recorded, the new message is freed and
// execution continues.
#define OPENCL_SUCCEED_OR_RETURN(e) { \
char *error = OPENCL_SUCCEED_NONFATAL(e); \
if (error) { \
if (!ctx->error) { \
ctx->error = error; \
return bad; \
} else { \
free(error); \
} \
} \
}
// OPENCL_SUCCEED_OR_RETURN returns the value of the variable 'bad' in
// scope. By default, it will be this one. Create a local variable
// of some other type if needed. This is a bit of a hack, but it
// saves effort in the code generator.
static const int bad = 1;
/* User-adjustable settings consulted while selecting a device and
   building the OpenCL program. */
struct opencl_config {
// When nonzero, opencl_alloc() reports on stderr each time it
// releases a free-list entry after an allocation failure.
int debugging;
// When nonzero, the chosen platform/device and default sizes are
// printed to stderr during setup.
int logging;
// Index among the devices that match the preferences below.
int preferred_device_num;
// Substrings that the platform and device names must contain; the
// empty string matches everything.
const char *preferred_platform;
const char *preferred_device;
// When nonzero, is_blacklisted() is not consulted.
int ignore_blacklist;
// Optional file paths (NULL when unused) for writing or reading the
// program source and the program binary.
const char* dump_program_to;
const char* load_program_from;
const char* dump_binary_to;
const char* load_binary_from;
// Defaults applied to sizes whose own value is zero.
size_t default_group_size;
size_t default_num_groups;
size_t default_tile_size;
size_t default_threshold;
// When nonzero, a note is printed if the device forces the
// corresponding default down.  NOTE(review): presumably set when the
// user overrides the default -- confirm against the setter.
int default_group_size_changed;
int default_tile_size_changed;
// Parallel arrays of length num_sizes.  size_vars are emitted as -D
// macro names with size_values as their values; size_classes are
// matched by prefix against "group_size", "num_groups", "tile_size"
// and "threshold".
int num_sizes;
const char **size_names;
const char **size_vars;
size_t *size_values;
const char **size_classes;
};
void opencl_config_init(struct opencl_config *cfg,
int num_sizes,
const char *size_names[],
const char *size_vars[],
size_t *size_values,
const char *size_classes[]) {
cfg->debugging = 0;
cfg->logging = 0;
cfg->preferred_device_num = 0;
cfg->preferred_platform = "";
cfg->preferred_device = "";
cfg->ignore_blacklist = 0;
cfg->dump_program_to = NULL;
cfg->load_program_from = NULL;
cfg->dump_binary_to = NULL;
cfg->load_binary_from = NULL;
// The following are dummy sizes that mean the concrete defaults
// will be set during initialisation via hardware-inspection-based
// heuristics.
cfg->default_group_size = 0;
cfg->default_num_groups = 0;
cfg->default_tile_size = 0;
cfg->default_threshold = 0;
cfg->default_group_size_changed = 0;
cfg->default_tile_size_changed = 0;
cfg->num_sizes = num_sizes;
cfg->size_names = size_names;
cfg->size_vars = size_vars;
cfg->size_values = size_values;
cfg->size_classes = size_classes;
}
/* State shared by the OpenCL helper functions in this file. */
struct opencl_context {
// Device, context and command queue in use.
cl_device_id device;
cl_context ctx;
cl_command_queue queue;
// Settings; some defaults are clamped to device limits during setup.
struct opencl_config cfg;
// Released buffers kept for reuse by opencl_alloc().
struct free_list free_list;
// Limits recorded during setup.  A value of zero for max_num_groups
// or max_threshold means "no limit".
size_t max_group_size;
size_t max_num_groups;
size_t max_tile_size;
size_t max_threshold;
size_t max_local_memory;
// Passed to the OpenCL compiler as LOCKSTEP_WIDTH; a value of zero
// is replaced by 1 during setup.
size_t lockstep_width;
};
/* Describes one candidate device together with its platform. */
struct opencl_device_option {
cl_platform_id platform;
cl_device_id device;
cl_device_type device_type;
// Heap-allocated strings; the code that enumerates devices frees
// them with free() for every option it does not hand out.
char *platform_name;
char *device_name;
};
/* This function must be defined by the user. It is invoked during
OpenCL setup, after the platform and device have been found but
before the program is loaded. Its intended use is to tune
constants based on the selected platform and device. */
static void post_opencl_setup(struct opencl_context*, struct opencl_device_option*);
/* Return a heap-allocated copy of 'str' (to be released with free()),
   or NULL if the allocation fails. */
static char *strclone(const char *str) {
  const size_t bytes = strlen(str) + 1; // Include the terminating NUL.
  char *dup = malloc(bytes);
  return dup != NULL ? memcpy(dup, str, bytes) : NULL;
}
// Read a file into a NUL-terminated string; returns NULL on error
// (the file cannot be opened, its length cannot be determined, memory
// runs out, or the read comes up short).  On success the caller owns
// the returned buffer and, if 'size' is non-NULL, *size receives the
// number of bytes read, not counting the terminating NUL.
static char* slurp_file(const char *filename, size_t *size) {
  FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.
  if (f == NULL) {
    return NULL;
  }

  char *s = NULL;
  long end = -1;
  if (fseek(f, 0, SEEK_END) == 0) {
    end = ftell(f); // Negative on failure.
  }

  if (end >= 0 && fseek(f, 0, SEEK_SET) == 0) {
    size_t src_size = (size_t)end;
    s = malloc(src_size + 1);
    if (s != NULL) {
      if (fread(s, 1, src_size, f) == src_size) {
        s[src_size] = '\0';
        if (size) {
          *size = src_size;
        }
      } else {
        free(s);
        s = NULL;
      }
    }
  }

  fclose(f);
  return s;
}
static const char* opencl_error_string(unsigned int err)
{
switch (err) {
case CL_SUCCESS: return "Success!";
case CL_DEVICE_NOT_FOUND: return "Device not found.";
case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
case CL_OUT_OF_RESOURCES: return "Out of resources";
case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
case CL_MAP_FAILURE: return "Map failure";
case CL_INVALID_VALUE: return "Invalid value";
case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
case CL_INVALID_PLATFORM: return "Invalid platform";
case CL_INVALID_DEVICE: return "Invalid device";
case CL_INVALID_CONTEXT: return "Invalid context";
case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
case CL_INVALID_HOST_PTR: return "Invalid host pointer";
case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
case CL_INVALID_SAMPLER: return "Invalid sampler";
case CL_INVALID_BINARY: return "Invalid binary";
case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
case CL_INVALID_PROGRAM: return "Invalid program";
case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
case CL_INVALID_KERNEL: return "Invalid kernel";
case CL_INVALID_ARG_INDEX: return "Invalid argument index";
case CL_INVALID_ARG_VALUE: return "Invalid argument value";
case CL_INVALID_ARG_SIZE: return "Invalid argument size";
case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
case CL_INVALID_EVENT: return "Invalid event";
case CL_INVALID_OPERATION: return "Invalid operation";
case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
default: return "Unknown";
}
}
/* If 'ret' is not CL_SUCCESS, report the failed call (its source
   text, file and line, numeric code and description) through panic()
   with exit status -1.  Does nothing on success. */
static void opencl_succeed_fatal(unsigned int ret,
                                 const char *call,
                                 const char *file,
                                 int line) {
  if (ret != CL_SUCCESS) {
    // The code is printed with %d, so pass it as an int to match the
    // conversion specifier.
    panic(-1, "%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n",
          file, line, call, (int)ret, opencl_error_string(ret));
  }
}
/* If 'ret' is not CL_SUCCESS, return a message built with msgprintf()
   that describes the failed call; the caller is responsible for
   freeing it.  Returns NULL on success. */
static char* opencl_succeed_nonfatal(unsigned int ret,
                                     const char *call,
                                     const char *file,
                                     int line) {
  if (ret == CL_SUCCESS) {
    return NULL;
  }
  // The code is printed with %d, so pass it as an int to match the
  // conversion specifier.
  return msgprintf("%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n",
                   file, line, call, (int)ret, opencl_error_string(ret));
}
/* Record 's' as the preferred platform name (the pointer is stored,
   not copied) and turn off blacklist filtering. */
void set_preferred_platform(struct opencl_config *cfg, const char *s) {
  cfg->ignore_blacklist = 1;
  cfg->preferred_platform = s;
}
/* Record the preferred device.  's' is either a device name, or
   "#N" optionally followed by whitespace and a name, where N is the
   index among matching devices (0 when absent).  The name portion is
   stored by pointer into 's', not copied.  Also turns off blacklist
   filtering. */
void set_preferred_device(struct opencl_config *cfg, const char *s) {
  int x = 0;
  if (*s == '#') {
    s++;
    // The <ctype.h> functions require a value representable as
    // unsigned char, so convert before classifying.
    while (isdigit((unsigned char)*s)) {
      x = x * 10 + (*s++)-'0';
    }
    // Skip trailing spaces.
    while (isspace((unsigned char)*s)) {
      s++;
    }
  }
  cfg->preferred_device = s;
  cfg->preferred_device_num = x;
  cfg->ignore_blacklist = 1;
}
/* Fetch platform parameter 'param' into a freshly allocated buffer:
   the first query asks for the required size, the second fills the
   buffer.  Either query failing is fatal.  The caller frees the
   result. */
static char* opencl_platform_info(cl_platform_id platform,
                                  cl_platform_info param) {
  size_t req_bytes;
  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, 0, NULL, &req_bytes));

  char *info = malloc(req_bytes);
  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, req_bytes, info, NULL));
  return info;
}
/* Fetch device parameter 'param' into a freshly allocated buffer:
   the first query asks for the required size, the second fills the
   buffer.  Either query failing is fatal.  The caller frees the
   result. */
static char* opencl_device_info(cl_device_id device,
                                cl_device_info param) {
  size_t req_bytes;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, 0, NULL, &req_bytes));

  char *info = malloc(req_bytes);
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, req_bytes, info, NULL));
  return info;
}
static void opencl_all_device_options(struct opencl_device_option **devices_out,
size_t *num_devices_out) {
size_t num_devices = 0, num_devices_added = 0;
cl_platform_id *all_platforms;
cl_uint *platform_num_devices;
cl_uint num_platforms;
// Find the number of platforms.
OPENCL_SUCCEED_FATAL(clGetPlatformIDs(0, NULL, &num_platforms));
// Make room for them.
all_platforms = calloc(num_platforms, sizeof(cl_platform_id));
platform_num_devices = calloc(num_platforms, sizeof(cl_uint));
// Fetch all the platforms.
OPENCL_SUCCEED_FATAL(clGetPlatformIDs(num_platforms, all_platforms, NULL));
// Count the number of devices for each platform, as well as the
// total number of devices.
for (cl_uint i = 0; i < num_platforms; i++) {
if (clGetDeviceIDs(all_platforms[i], CL_DEVICE_TYPE_ALL,
0, NULL, &platform_num_devices[i]) == CL_SUCCESS) {
num_devices += platform_num_devices[i];
} else {
platform_num_devices[i] = 0;
}
}
// Make room for all the device options.
struct opencl_device_option *devices =
calloc(num_devices, sizeof(struct opencl_device_option));
// Loop through the platforms, getting information about their devices.
for (cl_uint i = 0; i < num_platforms; i++) {
cl_platform_id platform = all_platforms[i];
cl_uint num_platform_devices = platform_num_devices[i];
if (num_platform_devices == 0) {
continue;
}
char *platform_name = opencl_platform_info(platform, CL_PLATFORM_NAME);
cl_device_id *platform_devices =
calloc(num_platform_devices, sizeof(cl_device_id));
// Fetch all the devices.
OPENCL_SUCCEED_FATAL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL,
num_platform_devices, platform_devices, NULL));
// Loop through the devices, adding them to the devices array.
for (cl_uint i = 0; i < num_platform_devices; i++) {
char *device_name = opencl_device_info(platform_devices[i], CL_DEVICE_NAME);
devices[num_devices_added].platform = platform;
devices[num_devices_added].device = platform_devices[i];
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(platform_devices[i], CL_DEVICE_TYPE,
sizeof(cl_device_type),
&devices[num_devices_added].device_type,
NULL));
// We don't want the structs to share memory, so copy the platform name.
// Each device name is already unique.
devices[num_devices_added].platform_name = strclone(platform_name);
devices[num_devices_added].device_name = device_name;
num_devices_added++;
}
free(platform_devices);
free(platform_name);
}
free(all_platforms);
free(platform_num_devices);
*devices_out = devices;
*num_devices_out = num_devices;
}
// List every device on stdout, grouped by platform, and read a device
// number from stdin.  On success the choice is stored in 'cfg' (with
// the platform and device name preferences cleared and blacklist
// filtering turned off) and 0 is returned.  Returns 1 if no number
// could be read or if it does not refer to a listed device.
static int select_device_interactively(struct opencl_config *cfg) {
  struct opencl_device_option *devices;
  size_t num_devices;
  int ret = 1;

  opencl_all_device_options(&devices, &num_devices);

  printf("Choose OpenCL device:\n");
  const char *cur_platform = "";
  for (size_t i = 0; i < num_devices; i++) {
    struct opencl_device_option device = devices[i];
    // Print the platform heading only when the platform changes.
    if (strcmp(cur_platform, device.platform_name) != 0) {
      printf("Platform: %s\n", device.platform_name);
      cur_platform = device.platform_name;
    }
    printf("[%d] %s\n", (int)i, device.device_name);
  }

  int selection;
  printf("Choice: ");
  // The prompt has no trailing newline, so flush it explicitly to make
  // sure it is visible before we block on input.
  fflush(stdout);
  // Reject numbers that do not correspond to a listed device; they
  // could never be matched when the device is looked up later.
  if (scanf("%d", &selection) == 1 &&
      selection >= 0 && (size_t)selection < num_devices) {
    ret = 0;
    cfg->preferred_platform = "";
    cfg->preferred_device = "";
    cfg->preferred_device_num = selection;
    cfg->ignore_blacklist = 1;
  }

  // Free all the platform and device names.
  for (size_t j = 0; j < num_devices; j++) {
    free(devices[j].platform_name);
    free(devices[j].device_name);
  }
  free(devices);

  return ret;
}
/* Returns 1 if the device should be skipped by default, 0 otherwise.
   Nothing is blacklisted when a platform or device preference has
   been set.  Otherwise the only blacklisted combination is a platform
   name containing "Apple" with a device name containing
   "Intel(R) Core(TM)". */
static int is_blacklisted(const char *platform_name, const char *device_name,
                          const struct opencl_config *cfg) {
  const int has_preference =
    strcmp(cfg->preferred_platform, "") != 0 ||
    strcmp(cfg->preferred_device, "") != 0;
  if (has_preference) {
    return 0;
  }

  const int apple_platform = strstr(platform_name, "Apple") != NULL;
  const int intel_core_device = strstr(device_name, "Intel(R) Core(TM)") != NULL;
  return (apple_platform && intel_core_device) ? 1 : 0;
}
/* Return the device selected by 'cfg': among the devices whose
   platform and device names contain the preferred substrings (and
   that are not blacklisted, unless the blacklist is ignored), the one
   at position cfg->preferred_device_num.  The returned option keeps
   ownership of its two name strings; all others are freed.  Panics if
   there is no such device. */
static struct opencl_device_option get_preferred_device(const struct opencl_config *cfg) {
  struct opencl_device_option *devices;
  size_t num_devices;
  opencl_all_device_options(&devices, &num_devices);

  int num_device_matches = 0;
  for (size_t i = 0; i < num_devices; i++) {
    struct opencl_device_option device = devices[i];

    if (strstr(device.platform_name, cfg->preferred_platform) == NULL) {
      continue;
    }
    if (strstr(device.device_name, cfg->preferred_device) == NULL) {
      continue;
    }
    if (!cfg->ignore_blacklist &&
        is_blacklisted(device.platform_name, device.device_name, cfg)) {
      continue;
    }
    // Only devices that passed every filter above are counted.
    if (num_device_matches++ != cfg->preferred_device_num) {
      continue;
    }

    // Free all the platform and device names, except the ones we have chosen.
    for (size_t j = 0; j < num_devices; j++) {
      if (j == i) {
        continue;
      }
      free(devices[j].platform_name);
      free(devices[j].device_name);
    }
    free(devices);
    return device;
  }

  panic(1, "Could not find acceptable OpenCL device.\n");
  exit(1); // Never reached
}
/* Print the platform and device names of 'device' to stderr. */
static void describe_device_option(struct opencl_device_option device) {
  const char *platform = device.platform_name;
  const char *name = device.device_name;
  fprintf(stderr, "Using platform: %s\n", platform);
  fprintf(stderr, "Using device: %s\n", name);
}
/* Build 'program' for 'device' with the given compiler options and
   return the resulting build status.  If the status is not a success,
   the build log is printed to stderr.  Any clBuildProgram() error
   other than a build failure is fatal. */
static cl_build_status build_opencl_program(cl_program program, cl_device_id device, const char* options) {
  cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);

  // Avoid termination due to CL_BUILD_PROGRAM_FAILURE; that case is
  // handled through the build status below.
  switch (clBuildProgram_error) {
  case CL_SUCCESS:
  case CL_BUILD_PROGRAM_FAILURE:
    break;
  default:
    OPENCL_SUCCEED_FATAL(clBuildProgram_error);
  }

  cl_build_status build_status;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program,
                                             device,
                                             CL_PROGRAM_BUILD_STATUS,
                                             sizeof(cl_build_status),
                                             &build_status,
                                             NULL));
  if (build_status == CL_SUCCESS) {
    return build_status;
  }

  size_t ret_val_size;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));

  char *build_log = malloc(ret_val_size+1);
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));
  // The spec technically does not say whether the build log is zero-terminated, so let's be careful.
  build_log[ret_val_size] = '\0';

  fprintf(stderr, "Build log:\n%s\n", build_log);
  free(build_log);
  return build_status;
}
/* Fields in a bitmask indicating which types we must be sure are
available. */
enum opencl_required_type { OPENCL_F64 = 1 };
// Initialise 'ctx' around an existing command queue, then create the
// OpenCL program and return it.
//
// We take as input several strings representing the program, because
// C does not guarantee that the compiler supports particularly large
// literals. Notably, Visual C has a limit of 2048 characters. The
// 'srcs' array must be NULL-terminated, as must 'extra_build_opts'
// (which may also be NULL itself).
//
// 'required_types' is a bitmask of enum opencl_required_type values.
//
// Steps: query the device behind the queue, check required types,
// call post_opencl_setup(), clamp default and per-size values to the
// device limits, obtain the program source (from 'srcs', or from
// cfg.load_program_from), and then either build it from source or
// create it from the binary in cfg.load_binary_from.  The source
// and/or binary are dumped to files when the configuration asks for
// it.  Every OpenCL failure along the way is fatal.
static cl_program setup_opencl_with_command_queue(struct opencl_context *ctx,
                                                  cl_command_queue queue,
                                                  const char *srcs[],
                                                  int required_types,
                                                  const char *extra_build_opts[]) {
  int error;

  ctx->queue = queue;

  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx->ctx, NULL));

  // Fill out the device info. This is redundant work if we are
  // called from setup_opencl() (which is the common case), but I
  // doubt it matters much.
  struct opencl_device_option device_option;
  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_DEVICE,
                                             sizeof(cl_device_id),
                                             &device_option.device,
                                             NULL));
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PLATFORM,
                                       sizeof(cl_platform_id),
                                       &device_option.platform,
                                       NULL));
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_TYPE,
                                       sizeof(cl_device_type),
                                       &device_option.device_type,
                                       NULL));
  device_option.platform_name = opencl_platform_info(device_option.platform, CL_PLATFORM_NAME);
  device_option.device_name = opencl_device_info(device_option.device, CL_DEVICE_NAME);

  ctx->device = device_option.device;

  if (required_types & OPENCL_F64) {
    cl_uint supported;
    OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                                         sizeof(cl_uint), &supported, NULL));
    if (!supported) {
      panic(1, "Program uses double-precision floats, but this is not supported on the chosen device: %s\n",
            device_option.device_name);
    }
  }

  size_t max_group_size;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                       sizeof(size_t), &max_group_size, NULL));

  size_t max_tile_size = sqrt(max_group_size);

  // The size passed to the query must be that of the variable being
  // filled in, which is a cl_ulong here (not a size_t).
  cl_ulong max_local_memory;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_LOCAL_MEM_SIZE,
                                       sizeof(cl_ulong), &max_local_memory, NULL));

  // Make sure this function is defined.
  post_opencl_setup(ctx, &device_option);

  if (max_group_size < ctx->cfg.default_group_size) {
    if (ctx->cfg.default_group_size_changed) {
      fprintf(stderr, "Note: Device limits default group size to %zu (down from %zu).\n",
              max_group_size, ctx->cfg.default_group_size);
    }
    ctx->cfg.default_group_size = max_group_size;
  }

  if (max_tile_size < ctx->cfg.default_tile_size) {
    if (ctx->cfg.default_tile_size_changed) {
      fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n",
              max_tile_size, ctx->cfg.default_tile_size);
    }
    ctx->cfg.default_tile_size = max_tile_size;
  }

  ctx->max_group_size = max_group_size;
  ctx->max_tile_size = max_tile_size; // Square root of the group size limit.
  ctx->max_threshold = ctx->max_num_groups = 0; // No limit.
  ctx->max_local_memory = max_local_memory;

  // Now we go through all the sizes, clamp them to the valid range,
  // or set them to the default.
  for (int i = 0; i < ctx->cfg.num_sizes; i++) {
    const char *size_class = ctx->cfg.size_classes[i];
    size_t *size_value = &ctx->cfg.size_values[i];
    const char* size_name = ctx->cfg.size_names[i];
    size_t max_value, default_value;
    // strstr() returning the start of the string means the class
    // begins with the given prefix.
    if (strstr(size_class, "group_size") == size_class) {
      max_value = max_group_size;
      default_value = ctx->cfg.default_group_size;
    } else if (strstr(size_class, "num_groups") == size_class) {
      max_value = max_group_size; // Futhark assumes this constraint.
      default_value = ctx->cfg.default_num_groups;
    } else if (strstr(size_class, "tile_size") == size_class) {
      max_value = sqrt(max_group_size);
      default_value = ctx->cfg.default_tile_size;
    } else if (strstr(size_class, "threshold") == size_class) {
      max_value = 0; // No limit.
      default_value = ctx->cfg.default_threshold;
    } else {
      panic(1, "Unknown size class for size '%s': %s\n", size_name, size_class);
    }
    if (*size_value == 0) {
      *size_value = default_value;
    } else if (max_value > 0 && *size_value > max_value) {
      fprintf(stderr, "Note: Device limits %s to %d (down from %d)\n",
              size_name, (int)max_value, (int)*size_value);
      *size_value = max_value;
    }
  }

  if (ctx->lockstep_width == 0) {
    ctx->lockstep_width = 1;
  }

  if (ctx->cfg.logging) {
    fprintf(stderr, "Lockstep width: %d\n", (int)ctx->lockstep_width);
    fprintf(stderr, "Default group size: %d\n", (int)ctx->cfg.default_group_size);
    fprintf(stderr, "Default number of groups: %d\n", (int)ctx->cfg.default_num_groups);
  }

  char *fut_opencl_src = NULL;
  size_t src_size = 0;

  // Maybe we have to read OpenCL source from somewhere else (used for debugging).
  if (ctx->cfg.load_program_from != NULL) {
    // src_size stays zero on this path; it is later passed as the
    // source length to clCreateProgramWithSource().
    fut_opencl_src = slurp_file(ctx->cfg.load_program_from, NULL);
    assert(fut_opencl_src != NULL);
  } else {
    // Build the OpenCL program. First we have to concatenate all the fragments.
    for (const char **src = srcs; src && *src; src++) {
      src_size += strlen(*src);
    }

    fut_opencl_src = malloc(src_size + 1);
    assert(fut_opencl_src != NULL);

    size_t n = 0;
    for (size_t i = 0; srcs && srcs[i]; i++) {
      size_t fragment_len = strlen(srcs[i]);
      memcpy(fut_opencl_src + n, srcs[i], fragment_len);
      n += fragment_len;
    }
    fut_opencl_src[src_size] = 0;
  }

  cl_program prog;
  error = CL_SUCCESS;
  const char* src_ptr[] = {fut_opencl_src};

  if (ctx->cfg.dump_program_to != NULL) {
    FILE *f = fopen(ctx->cfg.dump_program_to, "w");
    assert(f != NULL);
    fputs(fut_opencl_src, f);
    fclose(f);
  }

  if (ctx->cfg.load_binary_from == NULL) {
    prog = clCreateProgramWithSource(ctx->ctx, 1, src_ptr, &src_size, &error);
    OPENCL_SUCCEED_FATAL(error);

    // Work out how much room the compiler options need.  The base
    // amount covers the LOCKSTEP_WIDTH definition.  Each size
    // contributes "-D<var>=<value> ", so it is the variable name (not
    // the display name) that must be measured; 20 extra bytes cover
    // the punctuation and the digits of an int.
    size_t compile_opts_size = 1024;
    for (int i = 0; i < ctx->cfg.num_sizes; i++) {
      compile_opts_size += strlen(ctx->cfg.size_vars[i]) + 20;
    }
    // Each extra option is followed by one space.
    for (int i = 0; extra_build_opts != NULL && extra_build_opts[i] != NULL; i++) {
      compile_opts_size += strlen(extra_build_opts[i]) + 1;
    }

    char *compile_opts = malloc(compile_opts_size);
    assert(compile_opts != NULL);

    int w = snprintf(compile_opts, compile_opts_size,
                     "-DLOCKSTEP_WIDTH=%d ",
                     (int)ctx->lockstep_width);

    for (int i = 0; i < ctx->cfg.num_sizes; i++) {
      w += snprintf(compile_opts+w, compile_opts_size-w,
                    "-D%s=%d ",
                    ctx->cfg.size_vars[i],
                    (int)ctx->cfg.size_values[i]);
    }

    for (int i = 0; extra_build_opts != NULL && extra_build_opts[i] != NULL; i++) {
      w += snprintf(compile_opts+w, compile_opts_size-w,
                    "%s ", extra_build_opts[i]);
    }

    OPENCL_SUCCEED_FATAL(build_opencl_program(prog, device_option.device, compile_opts));

    free(compile_opts);
  } else {
    size_t binary_size;
    unsigned char *fut_opencl_bin =
      (unsigned char*) slurp_file(ctx->cfg.load_binary_from, &binary_size);
    // It is the binary that was just read which must be checked here.
    assert(fut_opencl_bin != NULL);
    const unsigned char *binaries[1] = { fut_opencl_bin };
    cl_int status = 0;

    prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device,
                                     &binary_size, binaries,
                                     &status, &error);

    OPENCL_SUCCEED_FATAL(status);
    OPENCL_SUCCEED_FATAL(error);
  }

  free(fut_opencl_src);

  if (ctx->cfg.dump_binary_to != NULL) {
    size_t binary_size;
    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES,
                                          sizeof(size_t), &binary_size, NULL));
    unsigned char *binary = malloc(binary_size);
    unsigned char *binaries[1] = { binary };
    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARIES,
                                          sizeof(unsigned char*), binaries, NULL));

    // Binary mode, so the bytes are written unmodified on every
    // platform (the binary is read back in "rb" mode by slurp_file()).
    FILE *f = fopen(ctx->cfg.dump_binary_to, "wb");
    assert(f != NULL);
    fwrite(binary, sizeof(char), binary_size, f);
    fclose(f);
    free(binary);
  }

  return prog;
}
static cl_program setup_opencl(struct opencl_context *ctx,
const char *srcs[],
int required_types,
const char *extra_build_opts[]) {
ctx->lockstep_width = 0; // Real value set later.
free_list_init(&ctx->free_list);
struct opencl_device_option device_option = get_preferred_device(&ctx->cfg);
if (ctx->cfg.logging) {
describe_device_option(device_option);
}
// Note that NVIDIA's OpenCL requires the platform property
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)device_option.platform,
0
};
cl_int clCreateContext_error;
ctx->ctx = clCreateContext(properties, 1, &device_option.device, NULL, NULL, &clCreateContext_error);
OPENCL_SUCCEED_FATAL(clCreateContext_error);
cl_int clCreateCommandQueue_error;
cl_command_queue queue = clCreateCommandQueue(ctx->ctx, device_option.device, 0, &clCreateCommandQueue_error);
OPENCL_SUCCEED_FATAL(clCreateCommandQueue_error);
return setup_opencl_with_command_queue(ctx, queue, srcs, required_types, extra_build_opts);
}
// Allocate memory from the driver.  OpenCL may allocate lazily, in
// which case a failed allocation only shows up the first time the
// buffer is used.  We therefore write a few bytes to the new buffer
// straight away to find out whether the allocation really succeeded.
// This is slow, but the assumption is that this operation will be
// rare (most things will go through the free list).
//
// Returns the OpenCL status of the creation or, if that succeeded, of
// the probing write.
int opencl_alloc_actual(struct opencl_context *ctx, size_t size, cl_mem *mem_out) {
  int status;

  *mem_out = clCreateBuffer(ctx->ctx, CL_MEM_READ_WRITE, size, NULL, &status);
  if (status == CL_SUCCESS) {
    int probe = 2;
    // No need to wait for completion here. clWaitForEvents() cannot
    // return mem object allocation failures. This implies that the
    // buffer is faulted onto the device on enqueue. (Observation by
    // Andreas Kloeckner.)
    status = clEnqueueWriteBuffer(ctx->queue, *mem_out, CL_TRUE, 0,
                                  sizeof(probe), &probe, 0, NULL, NULL);
  }
  return status;
}
/* Obtain a buffer of at least 'min_size' bytes (and never fewer than
   sizeof(int)), stored in *mem_out.  A block with the same tag in the
   free list is reused if it is large enough; a block that is too
   small is released.  Otherwise a new buffer is allocated, releasing
   free-list entries one at a time for as long as the driver reports
   an allocation failure.  Returns an OpenCL status code. */
int opencl_alloc(struct opencl_context *ctx, size_t min_size, const char *tag, cl_mem *mem_out) {
  assert(min_size >= 0);
  if (min_size < sizeof(int)) {
    min_size = sizeof(int);
  }

  size_t size;
  if (free_list_find(&ctx->free_list, tag, &size, mem_out) == 0) {
    // Successfully found a free block. Is it big enough?
    //
    // FIXME: we might also want to check whether the block is *too
    // big*, to avoid internal fragmentation. However, this can
    // sharply impact performance on programs where arrays change size
    // frequently. Fortunately, such allocations are usually fairly
    // short-lived, as they are necessarily within a loop, so the risk
    // of internal fragmentation resulting in an OOM situation is
    // limited. However, it would be preferable if we could go back
    // and *shrink* oversize allocations when we encounter an OOM
    // condition. That is technically feasible, since we do not
    // expose OpenCL pointer values directly to the application, but
    // instead rely on a level of indirection.
    if (size >= min_size) {
      return CL_SUCCESS;
    }
    // Too small to be of use - free it.
    int release_status = clReleaseMemObject(*mem_out);
    if (release_status != CL_SUCCESS) {
      return release_status;
    }
  }

  // We have to allocate a new block from the driver. If that fails we
  // may be out of memory, so entries are released from the free list
  // until the allocation goes through. We cannot tell how far the
  // allocation is from fitting, so it is retried after every release.
  // This might be pretty expensive. Let's hope that this case is hit
  // rarely.
  int error = opencl_alloc_actual(ctx, min_size, mem_out);
  while (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) {
    if (ctx->cfg.debugging) {
      fprintf(stderr, "Out of OpenCL memory: releasing entry from the free list...\n");
    }

    cl_mem victim;
    if (free_list_first(&ctx->free_list, &victim) != 0) {
      // Nothing left to release; report the allocation failure.
      break;
    }

    error = clReleaseMemObject(victim);
    if (error != CL_SUCCESS) {
      return error;
    }

    error = opencl_alloc_actual(ctx, min_size, mem_out);
  }

  return error;
}
/* Hand 'mem' over to the free list under 'tag'.  A block already
   stored with the same tag is released first.  The size recorded for
   'mem' is the one reported by the driver.  Returns an OpenCL status
   code; 'mem' is only inserted if its size could be queried. */
int opencl_free(struct opencl_context *ctx, cl_mem mem, const char *tag) {
  size_t size;
  cl_mem existing_mem;
  int status;

  // If there is already a block with this tag, then remove it.
  if (free_list_find(&ctx->free_list, tag, &size, &existing_mem) == 0) {
    status = clReleaseMemObject(existing_mem);
    if (status != CL_SUCCESS) {
      return status;
    }
  }

  status = clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(size_t), &size, NULL);
  if (status != CL_SUCCESS) {
    return status;
  }

  free_list_insert(&ctx->free_list, size, mem, tag);
  return status;
}
/* Pack the free list and release every block it holds.  Stops at the
   first release that fails and returns its status; returns
   CL_SUCCESS once the list is empty. */
int opencl_free_all(struct opencl_context *ctx) {
  free_list_pack(&ctx->free_list);

  cl_mem mem;
  int status = CL_SUCCESS;
  while (status == CL_SUCCESS &&
         free_list_first(&ctx->free_list, &mem) == 0) {
    status = clReleaseMemObject(mem);
  }
  return status;
}
// End of opencl.h.
const char *opencl_program[] =
{"#ifdef cl_clang_storage_class_specifiers\n#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable\n#endif\n#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n__kernel void dummy_kernel(__global unsigned char *dummy, int n)\n{\n const int thread_gid = get_global_id(0);\n \n if (thread_gid >= n)\n return;\n}\ntypedef char int8_t;\ntypedef short int16_t;\ntypedef int int32_t;\ntypedef long int64_t;\ntypedef uchar uint8_t;\ntypedef ushort uint16_t;\ntypedef uint uint32_t;\ntypedef ulong uint64_t;\n#define ALIGNED_LOCAL_MEMORY(m,size) __local unsigned char m[size] __attribute__ ((align))\n#ifdef cl_nv_pragma_unroll\nstatic inline void mem_fence_global()\n{\n asm(\"membar.gl;\");\n}\n#else\nstatic inline void mem_fence_global()\n{\n mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n}\n#endif\nstatic inline void mem_fence_local()\n{\n mem_fence(CLK_LOCAL_MEM_FENCE);\n}\nstatic inline int8_t add8(int8_t x, int8_t y)\n{\n return x + y;\n}\nstatic inline int16_t add16(int16_t x, int16_t y)\n{\n return x + y;\n}\nstatic inline int32_t add32(int32_t x, int32_t y)\n{\n return x + y;\n}\nstatic inline int64_t add64(int64_t x, int64_t y)\n{\n return x + y;\n}\nstatic inline int8_t sub8(int8_t x, int8_t y)\n{\n return x - y;\n}\nstatic inline int16_t sub16(int16_t x, int16_t y)\n{\n return x - y;\n}\nstatic inline int32_t sub32(int32_t x, int32_t y)\n{\n return x - y;\n}\nstatic inline int64_t sub64(int64_t x, int64_t y)\n{\n return x - y;\n}\nstatic inline int8_t mul8(int8_t x, int8_t y)\n{\n return x * y;\n}\nstatic inline int16_t mul16(int16_t x, int16_t y)\n{\n return x * y;\n}\nstatic inline int32_t mul32(int32_t x, int32_t y)\n{\n return x * y;\n}\nstatic inline int64_t mul64(int64_t x, int64_t y)\n{\n return x * y;\n}\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y)\n{\n return x / y;\n}\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y)\n{\n return x / y;\n}\nstatic inline uint32_t 
udiv32(uint32_t x, uint32_t y)\n{\n ",
" return x / y;\n}\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y)\n{\n return x / y;\n}\nstatic inline uint8_t umod8(uint8_t x, uint8_t y)\n{\n return x % y;\n}\nstatic inline uint16_t umod16(uint16_t x, uint16_t y)\n{\n return x % y;\n}\nstatic inline uint32_t umod32(uint32_t x, uint32_t y)\n{\n return x % y;\n}\nstatic inline uint64_t umod64(uint64_t x, uint64_t y)\n{\n return x % y;\n}\nstatic inline int8_t sdiv8(int8_t x, int8_t y)\n{\n int8_t q = x / y;\n int8_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int16_t sdiv16(int16_t x, int16_t y)\n{\n int16_t q = x / y;\n int16_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int32_t sdiv32(int32_t x, int32_t y)\n{\n int32_t q = x / y;\n int32_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int64_t sdiv64(int64_t x, int64_t y)\n{\n int64_t q = x / y;\n int64_t r = x % y;\n \n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\nstatic inline int8_t smod8(int8_t x, int8_t y)\n{\n int8_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int16_t smod16(int16_t x, int16_t y)\n{\n int16_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int32_t smod32(int32_t x, int32_t y)\n{\n int32_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int64_t smod64(int64_t x, int64_t y)\n{\n int64_t r = x % y;\n \n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\nstatic inline int8_t squot8(int8_t x, int8_t y)\n{\n return x / y;\n}\nstatic inline int16_t squot16(int16_t x, int16_t y)\n{\n return x / y;\n}\nstatic inline int32_t squot32(int32_t x, int32_t y)\n{\n return x / y;\n}\nstatic inline int64_t squot64(int64_t x, int64_t y)\n{\n return x / y;\n}\nstatic inline int8_t srem8(int8_t x, int8_t y)\n{\n return x % y",
";\n}\nstatic inline int16_t srem16(int16_t x, int16_t y)\n{\n return x % y;\n}\nstatic inline int32_t srem32(int32_t x, int32_t y)\n{\n return x % y;\n}\nstatic inline int64_t srem64(int64_t x, int64_t y)\n{\n return x % y;\n}\nstatic inline int8_t smin8(int8_t x, int8_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int16_t smin16(int16_t x, int16_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int32_t smin32(int32_t x, int32_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int64_t smin64(int64_t x, int64_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint8_t umin8(uint8_t x, uint8_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint16_t umin16(uint16_t x, uint16_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint32_t umin32(uint32_t x, uint32_t y)\n{\n return x < y ? x : y;\n}\nstatic inline uint64_t umin64(uint64_t x, uint64_t y)\n{\n return x < y ? x : y;\n}\nstatic inline int8_t smax8(int8_t x, int8_t y)\n{\n return x < y ? y : x;\n}\nstatic inline int16_t smax16(int16_t x, int16_t y)\n{\n return x < y ? y : x;\n}\nstatic inline int32_t smax32(int32_t x, int32_t y)\n{\n return x < y ? y : x;\n}\nstatic inline int64_t smax64(int64_t x, int64_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint8_t umax8(uint8_t x, uint8_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint16_t umax16(uint16_t x, uint16_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint32_t umax32(uint32_t x, uint32_t y)\n{\n return x < y ? y : x;\n}\nstatic inline uint64_t umax64(uint64_t x, uint64_t y)\n{\n return x < y ? 
y : x;\n}\nstatic inline uint8_t shl8(uint8_t x, uint8_t y)\n{\n return x << y;\n}\nstatic inline uint16_t shl16(uint16_t x, uint16_t y)\n{\n return x << y;\n}\nstatic inline uint32_t shl32(uint32_t x, uint32_t y)\n{\n return x << y;\n}\nstatic inline uint64_t shl64(uint64_t x, uint64_t y)\n{\n return x << y;\n}\nstatic inline uint8_t lshr8(uint8_t x, uint8_t y)\n{\n return x >> y;\n}\nstatic inline uint16_t lshr16(uint16_t x, uint16_t y)\n{\n return x >> y;\n}\nstatic inline uint32_t ",
"lshr32(uint32_t x, uint32_t y)\n{\n return x >> y;\n}\nstatic inline uint64_t lshr64(uint64_t x, uint64_t y)\n{\n return x >> y;\n}\nstatic inline int8_t ashr8(int8_t x, int8_t y)\n{\n return x >> y;\n}\nstatic inline int16_t ashr16(int16_t x, int16_t y)\n{\n return x >> y;\n}\nstatic inline int32_t ashr32(int32_t x, int32_t y)\n{\n return x >> y;\n}\nstatic inline int64_t ashr64(int64_t x, int64_t y)\n{\n return x >> y;\n}\nstatic inline uint8_t and8(uint8_t x, uint8_t y)\n{\n return x & y;\n}\nstatic inline uint16_t and16(uint16_t x, uint16_t y)\n{\n return x & y;\n}\nstatic inline uint32_t and32(uint32_t x, uint32_t y)\n{\n return x & y;\n}\nstatic inline uint64_t and64(uint64_t x, uint64_t y)\n{\n return x & y;\n}\nstatic inline uint8_t or8(uint8_t x, uint8_t y)\n{\n return x | y;\n}\nstatic inline uint16_t or16(uint16_t x, uint16_t y)\n{\n return x | y;\n}\nstatic inline uint32_t or32(uint32_t x, uint32_t y)\n{\n return x | y;\n}\nstatic inline uint64_t or64(uint64_t x, uint64_t y)\n{\n return x | y;\n}\nstatic inline uint8_t xor8(uint8_t x, uint8_t y)\n{\n return x ^ y;\n}\nstatic inline uint16_t xor16(uint16_t x, uint16_t y)\n{\n return x ^ y;\n}\nstatic inline uint32_t xor32(uint32_t x, uint32_t y)\n{\n return x ^ y;\n}\nstatic inline uint64_t xor64(uint64_t x, uint64_t y)\n{\n return x ^ y;\n}\nstatic inline char ult8(uint8_t x, uint8_t y)\n{\n return x < y;\n}\nstatic inline char ult16(uint16_t x, uint16_t y)\n{\n return x < y;\n}\nstatic inline char ult32(uint32_t x, uint32_t y)\n{\n return x < y;\n}\nstatic inline char ult64(uint64_t x, uint64_t y)\n{\n return x < y;\n}\nstatic inline char ule8(uint8_t x, uint8_t y)\n{\n return x <= y;\n}\nstatic inline char ule16(uint16_t x, uint16_t y)\n{\n return x <= y;\n}\nstatic inline char ule32(uint32_t x, uint32_t y)\n{\n return x <= y;\n}\nstatic inline char ule64(uint64_t x, uint64_t y)\n{\n return x <= y;\n}\nstatic inline char slt8(int8_t x, int8_t y)\n{\n return x < y;\n}\nstatic inline char slt16(int16_t 
x, int16_t y)\n{\n return x",
" < y;\n}\nstatic inline char slt32(int32_t x, int32_t y)\n{\n return x < y;\n}\nstatic inline char slt64(int64_t x, int64_t y)\n{\n return x < y;\n}\nstatic inline char sle8(int8_t x, int8_t y)\n{\n return x <= y;\n}\nstatic inline char sle16(int16_t x, int16_t y)\n{\n return x <= y;\n}\nstatic inline char sle32(int32_t x, int32_t y)\n{\n return x <= y;\n}\nstatic inline char sle64(int64_t x, int64_t y)\n{\n return x <= y;\n}\nstatic inline int8_t pow8(int8_t x, int8_t y)\n{\n int8_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline int16_t pow16(int16_t x, int16_t y)\n{\n int16_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline int32_t pow32(int32_t x, int32_t y)\n{\n int32_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline int64_t pow64(int64_t x, int64_t y)\n{\n int64_t res = 1, rem = y;\n \n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\nstatic inline bool itob_i8_bool(int8_t x)\n{\n return x;\n}\nstatic inline bool itob_i16_bool(int16_t x)\n{\n return x;\n}\nstatic inline bool itob_i32_bool(int32_t x)\n{\n return x;\n}\nstatic inline bool itob_i64_bool(int64_t x)\n{\n return x;\n}\nstatic inline int8_t btoi_bool_i8(bool x)\n{\n return x;\n}\nstatic inline int16_t btoi_bool_i16(bool x)\n{\n return x;\n}\nstatic inline int32_t btoi_bool_i32(bool x)\n{\n return x;\n}\nstatic inline int64_t btoi_bool_i64(bool x)\n{\n return x;\n}\n#define sext_i8_i8(x) ((int8_t) (int8_t) x)\n#define sext_i8_i16(x) ((int16_t) (int8_t) x)\n#define sext_i8_i32(x) ((int32_t) (int8_t) x)\n#define sext_i8_i64(x) ((int64_t) (int8_t) x)\n#define sext_i16_i8(x) ((int8_t) (int16_t) x)\n#define sext_i",
"16_i16(x) ((int16_t) (int16_t) x)\n#define sext_i16_i32(x) ((int32_t) (int16_t) x)\n#define sext_i16_i64(x) ((int64_t) (int16_t) x)\n#define sext_i32_i8(x) ((int8_t) (int32_t) x)\n#define sext_i32_i16(x) ((int16_t) (int32_t) x)\n#define sext_i32_i32(x) ((int32_t) (int32_t) x)\n#define sext_i32_i64(x) ((int64_t) (int32_t) x)\n#define sext_i64_i8(x) ((int8_t) (int64_t) x)\n#define sext_i64_i16(x) ((int16_t) (int64_t) x)\n#define sext_i64_i32(x) ((int32_t) (int64_t) x)\n#define sext_i64_i64(x) ((int64_t) (int64_t) x)\n#define zext_i8_i8(x) ((uint8_t) (uint8_t) x)\n#define zext_i8_i16(x) ((uint16_t) (uint8_t) x)\n#define zext_i8_i32(x) ((uint32_t) (uint8_t) x)\n#define zext_i8_i64(x) ((uint64_t) (uint8_t) x)\n#define zext_i16_i8(x) ((uint8_t) (uint16_t) x)\n#define zext_i16_i16(x) ((uint16_t) (uint16_t) x)\n#define zext_i16_i32(x) ((uint32_t) (uint16_t) x)\n#define zext_i16_i64(x) ((uint64_t) (uint16_t) x)\n#define zext_i32_i8(x) ((uint8_t) (uint32_t) x)\n#define zext_i32_i16(x) ((uint16_t) (uint32_t) x)\n#define zext_i32_i32(x) ((uint32_t) (uint32_t) x)\n#define zext_i32_i64(x) ((uint64_t) (uint32_t) x)\n#define zext_i64_i8(x) ((uint8_t) (uint64_t) x)\n#define zext_i64_i16(x) ((uint16_t) (uint64_t) x)\n#define zext_i64_i32(x) ((uint32_t) (uint64_t) x)\n#define zext_i64_i64(x) ((uint64_t) (uint64_t) x)\nstatic inline float fdiv32(float x, float y)\n{\n return x / y;\n}\nstatic inline float fadd32(float x, float y)\n{\n return x + y;\n}\nstatic inline float fsub32(float x, float y)\n{\n return x - y;\n}\nstatic inline float fmul32(float x, float y)\n{\n return x * y;\n}\nstatic inline float fmin32(float x, float y)\n{\n return x < y ? x : y;\n}\nstatic inline float fmax32(float x, float y)\n{\n return x < y ? 
y : x;\n}\nstatic inline float fpow32(float x, float y)\n{\n return pow(x, y);\n}\nstatic inline char cmplt32(float x, float y)\n{\n return x < y;\n}\nstatic inline char cmple32(float x, float y)\n{\n return x <= y;\n}\nstatic inline float sitofp_i8_f32(int8_t x)\n{\n return x;\n}\nstatic inline float ",
"sitofp_i16_f32(int16_t x)\n{\n return x;\n}\nstatic inline float sitofp_i32_f32(int32_t x)\n{\n return x;\n}\nstatic inline float sitofp_i64_f32(int64_t x)\n{\n return x;\n}\nstatic inline float uitofp_i8_f32(uint8_t x)\n{\n return x;\n}\nstatic inline float uitofp_i16_f32(uint16_t x)\n{\n return x;\n}\nstatic inline float uitofp_i32_f32(uint32_t x)\n{\n return x;\n}\nstatic inline float uitofp_i64_f32(uint64_t x)\n{\n return x;\n}\nstatic inline int8_t fptosi_f32_i8(float x)\n{\n return x;\n}\nstatic inline int16_t fptosi_f32_i16(float x)\n{\n return x;\n}\nstatic inline int32_t fptosi_f32_i32(float x)\n{\n return x;\n}\nstatic inline int64_t fptosi_f32_i64(float x)\n{\n return x;\n}\nstatic inline uint8_t fptoui_f32_i8(float x)\n{\n return x;\n}\nstatic inline uint16_t fptoui_f32_i16(float x)\n{\n return x;\n}\nstatic inline uint32_t fptoui_f32_i32(float x)\n{\n return x;\n}\nstatic inline uint64_t fptoui_f32_i64(float x)\n{\n return x;\n}\nstatic inline float futrts_log32(float x)\n{\n return log(x);\n}\nstatic inline float futrts_log2_32(float x)\n{\n return log2(x);\n}\nstatic inline float futrts_log10_32(float x)\n{\n return log10(x);\n}\nstatic inline float futrts_sqrt32(float x)\n{\n return sqrt(x);\n}\nstatic inline float futrts_exp32(float x)\n{\n return exp(x);\n}\nstatic inline float futrts_cos32(float x)\n{\n return cos(x);\n}\nstatic inline float futrts_sin32(float x)\n{\n return sin(x);\n}\nstatic inline float futrts_tan32(float x)\n{\n return tan(x);\n}\nstatic inline float futrts_acos32(float x)\n{\n return acos(x);\n}\nstatic inline float futrts_asin32(float x)\n{\n return asin(x);\n}\nstatic inline float futrts_atan32(float x)\n{\n return atan(x);\n}\nstatic inline float futrts_atan2_32(float x, float y)\n{\n return atan2(x, y);\n}\nstatic inline float futrts_gamma32(float x)\n{\n return tgamma(x);\n}\nstatic inline float futrts_lgamma32(float x)\n{\n return lgamma(x);\n}\nstatic inline float futrts_round32(float x)\n{\n return 
rint(x);\n}\nstatic inline char futrts_isnan32(fl",
"oat x)\n{\n return isnan(x);\n}\nstatic inline char futrts_isinf32(float x)\n{\n return isinf(x);\n}\nstatic inline int32_t futrts_to_bits32(float x)\n{\n union {\n float f;\n int32_t t;\n } p;\n \n p.f = x;\n return p.t;\n}\nstatic inline float futrts_from_bits32(int32_t x)\n{\n union {\n int32_t f;\n float t;\n } p;\n \n p.f = x;\n return p.t;\n}\nstatic inline double fdiv64(double x, double y)\n{\n return x / y;\n}\nstatic inline double fadd64(double x, double y)\n{\n return x + y;\n}\nstatic inline double fsub64(double x, double y)\n{\n return x - y;\n}\nstatic inline double fmul64(double x, double y)\n{\n return x * y;\n}\nstatic inline double fmin64(double x, double y)\n{\n return x < y ? x : y;\n}\nstatic inline double fmax64(double x, double y)\n{\n return x < y ? y : x;\n}\nstatic inline double fpow64(double x, double y)\n{\n return pow(x, y);\n}\nstatic inline char cmplt64(double x, double y)\n{\n return x < y;\n}\nstatic inline char cmple64(double x, double y)\n{\n return x <= y;\n}\nstatic inline double sitofp_i8_f64(int8_t x)\n{\n return x;\n}\nstatic inline double sitofp_i16_f64(int16_t x)\n{\n return x;\n}\nstatic inline double sitofp_i32_f64(int32_t x)\n{\n return x;\n}\nstatic inline double sitofp_i64_f64(int64_t x)\n{\n return x;\n}\nstatic inline double uitofp_i8_f64(uint8_t x)\n{\n return x;\n}\nstatic inline double uitofp_i16_f64(uint16_t x)\n{\n return x;\n}\nstatic inline double uitofp_i32_f64(uint32_t x)\n{\n return x;\n}\nstatic inline double uitofp_i64_f64(uint64_t x)\n{\n return x;\n}\nstatic inline int8_t fptosi_f64_i8(double x)\n{\n return x;\n}\nstatic inline int16_t fptosi_f64_i16(double x)\n{\n return x;\n}\nstatic inline int32_t fptosi_f64_i32(double x)\n{\n return x;\n}\nstatic inline int64_t fptosi_f64_i64(double x)\n{\n return x;\n}\nstatic inline uint8_t fptoui_f64_i8(double x)\n{\n return x;\n}\nstatic inline uint16_t fptoui_f64_i16(double x)\n{\n return x;\n}\nstatic inline uint32_t fptoui_f64_i32(double x)\n{\n r",
"eturn x;\n}\nstatic inline uint64_t fptoui_f64_i64(double x)\n{\n return x;\n}\nstatic inline double futrts_log64(double x)\n{\n return log(x);\n}\nstatic inline double futrts_log2_64(double x)\n{\n return log2(x);\n}\nstatic inline double futrts_log10_64(double x)\n{\n return log10(x);\n}\nstatic inline double futrts_sqrt64(double x)\n{\n return sqrt(x);\n}\nstatic inline double futrts_exp64(double x)\n{\n return exp(x);\n}\nstatic inline double futrts_cos64(double x)\n{\n return cos(x);\n}\nstatic inline double futrts_sin64(double x)\n{\n return sin(x);\n}\nstatic inline double futrts_tan64(double x)\n{\n return tan(x);\n}\nstatic inline double futrts_acos64(double x)\n{\n return acos(x);\n}\nstatic inline double futrts_asin64(double x)\n{\n return asin(x);\n}\nstatic inline double futrts_atan64(double x)\n{\n return atan(x);\n}\nstatic inline double futrts_atan2_64(double x, double y)\n{\n return atan2(x, y);\n}\nstatic inline double futrts_gamma64(double x)\n{\n return tgamma(x);\n}\nstatic inline double futrts_lgamma64(double x)\n{\n return lgamma(x);\n}\nstatic inline double futrts_round64(double x)\n{\n return rint(x);\n}\nstatic inline char futrts_isnan64(double x)\n{\n return isnan(x);\n}\nstatic inline char futrts_isinf64(double x)\n{\n return isinf(x);\n}\nstatic inline int64_t futrts_to_bits64(double x)\n{\n union {\n double f;\n int64_t t;\n } p;\n \n p.f = x;\n return p.t;\n}\nstatic inline double futrts_from_bits64(int64_t x)\n{\n union {\n int64_t f;\n double t;\n } p;\n \n p.f = x;\n return p.t;\n}\nstatic inline float fpconv_f32_f32(float x)\n{\n return x;\n}\nstatic inline double fpconv_f32_f64(float x)\n{\n return x;\n}\nstatic inline float fpconv_f64_f32(double x)\n{\n return x;\n}\nstatic inline double fpconv_f64_f64(double x)\n{\n return x;\n}\n__kernel void copy_83061(int32_t K_68510, int32_t D_68526, __global\n unsigned char *mem_81429, __global\n unsigned char *mem_81433)\n{\n con",
"st int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t copy_gtid_83061;\n int32_t copy_ltid_83062;\n int32_t copy_gid_83063;\n \n copy_gtid_83061 = get_global_id(0);\n copy_ltid_83062 = get_local_id(0);\n copy_gid_83063 = get_group_id(0);\n if (slt32(copy_gtid_83061, K_68510 * D_68526)) {\n *(__global double *) &mem_81433[(squot32(copy_gtid_83061, D_68526) *\n D_68526 + (copy_gtid_83061 -\n squot32(copy_gtid_83061,\n D_68526) *\n D_68526)) * 8] = *(__global\n double *) &mem_81429[(0 *\n K_68510 +\n ((copy_gtid_83061 -\n squot32(copy_gtid_83061,\n D_68526) *\n D_68526) *\n K_68510 +\n squot32(copy_gtid_83061,\n D_68526))) *\n 8];\n }\n}\n__kernel void map_69238(int32_t N_68316, int32_t K_68318, int32_t D_68320,\n int32_t D_68322, int32_t triD_68324, int32_t D_68333,\n int32_t i",
"nner_ldim_80101, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *means_mem_80368, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *mem_80384, __global\n unsigned char *mem_80390, __global\n unsigned char *mem_80393)\n{\n const int32_t group_sizze_69861 = gmm_objectivezigroup_sizze_69218;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(mem_80387_backing_0, 8 *\n sext_i32_i64(gmm_objectivezigroup_sizze_69218));\n \n int32_t global_tid_69238;\n int32_t local_tid_69239;\n int32_t group_sizze_82036;\n int32_t wave_sizze_82035;\n int32_t group_id_69240;\n \n global_tid_69238 = get_global_id(0);\n local_tid_69239 = get_local_id(0);\n group_sizze_82036 = get_local_size(0);\n wave_sizze_82035 = LOCKSTEP_WIDTH;\n group_id_69240 = get_group_id(0);\n \n int32_t gtid_69216;\n int32_t inner_ltid_80100;\n \n gtid_69216 = srem32(global_tid_69238, inner_ldim_80101) +\n squot32(global_tid_69238, inner_ldim_80101) * inner_ldim_80101;\n inner_ltid_80100 = srem32(global_tid_69238, inner_ldim_80101);\n \n __local char *mem_80387;\n \n mem_80387 = (__local char *) mem_80387_backing_0;\n \n double res_69882;\n double res_69986;\n \n if (slt32(gtid_69216, N_68316)) {\n double x_69885 = 0.0;\n \n for (int32_t chunk_offset_69884 = 0; chunk_offset_69884 < K_68318;\n chunk_offset_69884++) {\n double alphas_elem_69898 = *(__global\n double *) &alphas_mem_80367[chunk_offset_69884 *\n 8];\n double res_69904;\n double x_69907 = 0.0;\n int32_t c",
"hunk_sizze_69905;\n int32_t chunk_offset_69906 = 0;\n \n while (slt32(chunk_offset_69906, D_68333)) {\n if (slt32(D_68333 - chunk_offset_69906, group_sizze_69861)) {\n chunk_sizze_69905 = D_68333 - chunk_offset_69906;\n } else {\n chunk_sizze_69905 = group_sizze_69861;\n }\n \n double res_69909;\n double sync_80107;\n \n for (int32_t comb_iter_82038 = 0; comb_iter_82038 <\n squot32(group_sizze_69861 + inner_ldim_80101 - 1,\n inner_ldim_80101); comb_iter_82038++) {\n int32_t cid_80099;\n int32_t flat_comb_id_82039 = comb_iter_82038 *\n inner_ldim_80101 + local_tid_69239;\n \n cid_80099 = flat_comb_id_82039;\n if (slt32(cid_80099, chunk_sizze_69905) && 1) {\n double x_chunk_outer_elem_80098 = *(__global\n double *) &qs_mem_80369[(chunk_offset_69884 *\n D_68322 +\n chunk_offset_69906 +\n local_tid_69239) *\n 8];\n \n *(__local double *) &mem_80387[cid_80099 * 8] =\n x_chunk_outer_elem_80098;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n double acc_69912 = x_69907;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_69910;\n \n groupstream_mapaccum_dummy_",
"chunk_sizze_69910 = 1;\n if (chunk_sizze_69905 == group_sizze_69861) {\n for (int32_t i_69911 = 0; i_69911 < group_sizze_69861;\n i_69911++) {\n double x_69914;\n double res_69917;\n \n x_69914 = *(__local double *) &mem_80387[i_69911 * 8];\n res_69917 = acc_69912 + x_69914;\n \n double acc_tmp_82040 = res_69917;\n \n acc_69912 = acc_tmp_82040;\n }\n } else {\n for (int32_t i_69911 = 0; i_69911 < chunk_sizze_69905;\n i_69911++) {\n double x_69914;\n double res_69917;\n \n x_69914 = *(__local double *) &mem_80387[i_69911 * 8];\n res_69917 = acc_69912 + x_69914;\n \n double acc_tmp_82041 = res_69917;\n \n acc_69912 = acc_tmp_82041;\n }\n }\n res_69909 = acc_69912;\n sync_80107 = res_69909;\n barrier(CLK_LOCAL_MEM_FENCE);\n x_69907 = sync_80107;\n chunk_offset_69906 += group_sizze_69861;\n }\n res_69904 = x_69907;\n \n double x_69918;\n double res_69928;\n double y_69981;\n double res_69982;\n double res_69983;\n double res_69985;\n \n x_69918 = alphas_elem_69898 + res_69904;\n for (int32_t i_69923 = 0; i_69923 < D_68333; i_69923++) {\n double x_elem_elem_69924;\n double means_elem_elem_69925;\n double res_69926;\n \n x_elem_elem_69924 = *(__global double *) &mem_80384",
"[(i_69923 *\n N_68316 +\n gtid_69216) *\n 8];\n means_elem_elem_69925 = *(__global\n double *) &means_mem_80368[(chunk_offset_69884 *\n D_68320 +\n i_69923) *\n 8];\n res_69926 = x_elem_elem_69924 - means_elem_elem_69925;\n *(__global double *) &mem_80390[(group_id_69240 *\n (inner_ldim_80101 * D_68333) +\n local_tid_69239 + i_69923 *\n inner_ldim_80101) * 8] =\n res_69926;\n }\n \n double x_69931 = 0.0;\n \n for (int32_t chunk_offset_69930 = 0; chunk_offset_69930 < D_68333;\n chunk_offset_69930++) {\n double qs_elem_elem_69941;\n double res_69943;\n double res_69978;\n double res_69980;\n \n qs_elem_elem_69941 = *(__global\n double *) &qs_mem_80369[(chunk_offset_69884 *\n D_68322 +\n chunk_offset_69930) *\n 8];\n \n double x_69946 = 0.0;\n \n for (int32_t chunk_offset_69945 = 0; chunk_offset_69945 <\n D_68333; chunk_offset_69945++) {\n double x_69956;\n bool co",
"nd_69958;\n double res_69959;\n double res_69975;\n double res_69977;\n \n x_69956 = *(__global double *) &mem_80390[(group_id_69240 *\n (inner_ldim_80101 *\n D_68333) +\n local_tid_69239 +\n chunk_offset_69945 *\n inner_ldim_80101) *\n 8];\n cond_69958 = slt32(chunk_offset_69930, chunk_offset_69945);\n if (cond_69958) {\n res_69959 = 0.0;\n } else {\n bool cond_69960;\n double res_69961;\n \n cond_69960 = chunk_offset_69930 == chunk_offset_69945;\n if (cond_69960) {\n double res_69962;\n \n res_69962 = futrts_exp64(qs_elem_elem_69941);\n res_69961 = res_69962;\n } else {\n int32_t y_69963;\n int32_t x_69964;\n int32_t res_69965;\n int32_t gmm_knossos_tri_arg_69966;\n int32_t y_69967;\n int32_t x_69968;\n int32_t res_69969;\n int32_t x_69970;\n int32_t x_69971;\n int32_t y_69972;\n int32_t i_69973;\n double res_69974;\n \n y_69963 = D_68333 - 1;\n ",
" x_69964 = D_68333 * y_69963;\n res_69965 = sdiv32(x_69964, 2);\n gmm_knossos_tri_arg_69966 = D_68333 -\n chunk_offset_69945;\n y_69967 = gmm_knossos_tri_arg_69966 - 1;\n x_69968 = gmm_knossos_tri_arg_69966 * y_69967;\n res_69969 = sdiv32(x_69968, 2);\n x_69970 = res_69965 - res_69969;\n x_69971 = chunk_offset_69930 - chunk_offset_69945;\n y_69972 = x_69971 - 1;\n i_69973 = x_69970 + y_69972;\n res_69974 = *(__global\n double *) &icf_mem_80370[(chunk_offset_69884 *\n triD_68324 +\n i_69973) *\n 8];\n res_69961 = res_69974;\n }\n res_69959 = res_69961;\n }\n res_69975 = x_69956 * res_69959;\n res_69977 = x_69946 + res_69975;\n \n double x_tmp_82044 = res_69977;\n \n x_69946 = x_tmp_82044;\n }\n res_69943 = x_69946;\n res_69978 = res_69943 * res_69943;\n res_69980 = x_69931 + res_69978;\n \n double x_tmp_82043 = res_69980;\n \n x_69931 = x_tmp_82043;\n }\n res_69928 = x_69931;\n y_69981 = 0.5 * res_69928;\n res_69982 = x_69918 - y_69981;\n res_69983 = futrts_exp64(res_69982);\n res_69985 = x_69885 + res_69983;\n \n double x_tmp_82037 = res_69985;\n ",
" \n x_69885 = x_tmp_82037;\n }\n res_69882 = x_69885;\n res_69986 = futrts_log64(res_69882);\n }\n if (slt32(gtid_69216, N_68316)) {\n *(__global double *) &mem_80393[gtid_69216 * 8] = res_69986;\n }\n}\n__kernel void map_69582(int32_t N_68316, __global unsigned char *mem_80432,\n __global unsigned char *mem_80435)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_69582;\n int32_t local_tid_69583;\n int32_t group_sizze_82102;\n int32_t wave_sizze_82101;\n int32_t group_id_69584;\n \n global_tid_69582 = get_global_id(0);\n local_tid_69583 = get_local_id(0);\n group_sizze_82102 = get_local_size(0);\n wave_sizze_82101 = LOCKSTEP_WIDTH;\n group_id_69584 = get_group_id(0);\n \n int32_t gtid_69575;\n \n gtid_69575 = global_tid_69582;\n \n double res_70226;\n double res_70227;\n \n if (slt32(gtid_69575, N_68316)) {\n res_70226 = *(__global double *) &mem_80432[gtid_69575 * 8];\n res_70227 = futrts_log64(res_70226);\n }\n if (slt32(gtid_69575, N_68316)) {\n *(__global double *) &mem_80435[gtid_69575 * 8] = res_70227;\n }\n}\n__kernel void map_70494(int32_t K_68318, int32_t K_68321, int32_t K_68323,\n int32_t triD_68324, int32_t D_68333, double x_68452,\n double res_68453, double y_68475,\n int32_t num_groups_70963, int32_t virt_groups_70970,\n __global unsigned char *mem_80461, __global\n unsigned char *mem_80465, __global\n unsigned char *mem_80468)\n{\n const int32_t group_sizze_70953 = gmm_objectivezigroup_sizze_70474;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_70494;\n int32_t local_tid_70495;\n int32_t group_sizze_82195;\n int32_t wave_sizze_82194;\n int32_t group_id_70496;\n ",
" \n global_tid_70494 = get_global_id(0);\n local_tid_70495 = get_local_id(0);\n group_sizze_82195 = get_local_size(0);\n wave_sizze_82194 = LOCKSTEP_WIDTH;\n group_id_70496 = get_group_id(0);\n \n int32_t gtid_70472;\n int32_t phys_group_id_82196;\n \n phys_group_id_82196 = get_group_id(0);\n for (int32_t i_82197 = 0; i_82197 < squot32(virt_groups_70970 -\n phys_group_id_82196 +\n num_groups_70963 - 1,\n num_groups_70963); i_82197++) {\n int32_t virt_group_id_82198 = phys_group_id_82196 + i_82197 *\n num_groups_70963;\n \n gtid_70472 = virt_group_id_82198 * group_sizze_70953 + local_tid_70495;\n \n double res_70975;\n double res_70991;\n double y_71006;\n double y_71007;\n double x_71008;\n double res_71009;\n double y_71023;\n double x_71024;\n double res_71025;\n \n if (slt32(gtid_70472, K_68318)) {\n double x_70978 = 0.0;\n \n for (int32_t chunk_offset_70977 = 0; chunk_offset_70977 < D_68333;\n chunk_offset_70977++) {\n double qs_elem_elem_70985;\n double res_70987;\n double res_70988;\n double res_70990;\n \n qs_elem_elem_70985 = *(__global\n double *) &mem_80465[(chunk_offset_70977 *\n K_68321 +\n gtid_70472) * 8];\n res_70987 = futrts_exp64(qs_elem_elem_70985);\n res_70988 = res_70987 * res_70987;\n res_70990 = x_70978 + res_70988;\n \n double x_tmp_82199 = res_70990;\n \n x_70978 = x_tmp_82199;\n }\n res_",
"70975 = x_70978;\n \n double x_70994 = 0.0;\n \n for (int32_t chunk_offset_70993 = 0; chunk_offset_70993 <\n triD_68324; chunk_offset_70993++) {\n double x_71001;\n double res_71003;\n double res_71005;\n \n x_71001 = *(__global double *) &mem_80461[(chunk_offset_70993 *\n K_68323 +\n gtid_70472) * 8];\n res_71003 = x_71001 * x_71001;\n res_71005 = x_70994 + res_71003;\n \n double x_tmp_82200 = res_71005;\n \n x_70994 = x_tmp_82200;\n }\n res_70991 = x_70994;\n y_71006 = res_70975 + res_70991;\n y_71007 = x_68452 * y_71006;\n x_71008 = 0.5 * y_71007;\n \n double x_71012 = 0.0;\n \n for (int32_t chunk_offset_71011 = 0; chunk_offset_71011 < D_68333;\n chunk_offset_71011++) {\n double x_71019;\n double res_71022;\n \n x_71019 = *(__global double *) &mem_80465[(chunk_offset_71011 *\n K_68321 +\n gtid_70472) * 8];\n res_71022 = x_71012 + x_71019;\n \n double x_tmp_82201 = res_71022;\n \n x_71012 = x_tmp_82201;\n }\n res_71009 = x_71012;\n y_71023 = res_68453 * res_71009;\n x_71024 = x_71008 - y_71023;\n res_71025 = x_71024 - y_68475;\n }\n if (slt32(gtid_70472, K_68318)) {\n *(__global double *) &mem_80468[gtid_70472 * 8] = res_71025;\n }\n }\n}\n__kernel void map_70670(int32_t K_68318, double res_68453, double y_68475,\n ",
" __global unsigned char *mem_80489, __global\n unsigned char *mem_80492, __global\n unsigned char *mem_80495)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_70670;\n int32_t local_tid_70671;\n int32_t group_sizze_82331;\n int32_t wave_sizze_82330;\n int32_t group_id_70672;\n \n global_tid_70670 = get_global_id(0);\n local_tid_70671 = get_local_id(0);\n group_sizze_82331 = get_local_size(0);\n wave_sizze_82330 = LOCKSTEP_WIDTH;\n group_id_70672 = get_group_id(0);\n \n int32_t gtid_70663;\n \n gtid_70663 = global_tid_70670;\n \n double x_71173;\n double res_71174;\n double y_71175;\n double x_71176;\n double res_71177;\n \n if (slt32(gtid_70663, K_68318)) {\n x_71173 = *(__global double *) &mem_80489[gtid_70663 * 8];\n res_71174 = *(__global double *) &mem_80492[gtid_70663 * 8];\n y_71175 = res_68453 * res_71174;\n x_71176 = x_71173 - y_71175;\n res_71177 = x_71176 - y_68475;\n }\n if (slt32(gtid_70663, K_68318)) {\n *(__global double *) &mem_80495[gtid_70663 * 8] = res_71177;\n }\n}\n__kernel void map_70713(int32_t K_68318, double x_68452, __global\n unsigned char *mem_80483, __global\n unsigned char *mem_80486, __global\n unsigned char *mem_80489)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_70713;\n int32_t local_tid_70714;\n int32_t group_sizze_82292;\n int32_t wave_sizze_82291;\n int32_t group_id_70715;\n \n global_tid_70713 = get_global_id(0);\n local_tid_70714 = get_local_id(0);\n group_sizze_82292 = get_local_size(0);\n wave_sizze_82291 = LOCKSTEP_WIDTH;\n group_id_70715 = get_group_id(0);\n \n int32_t gtid_70706;\n \n gtid_70706 = global_tid_70713;\n \n double res_71138;\n double res_71139;\n d",
"ouble y_71140;\n double y_71141;\n double x_71142;\n \n if (slt32(gtid_70706, K_68318)) {\n res_71138 = *(__global double *) &mem_80483[gtid_70706 * 8];\n res_71139 = *(__global double *) &mem_80486[gtid_70706 * 8];\n y_71140 = res_71138 + res_71139;\n y_71141 = x_68452 * y_71140;\n x_71142 = 0.5 * y_71141;\n }\n if (slt32(gtid_70706, K_68318)) {\n *(__global double *) &mem_80489[gtid_70706 * 8] = x_71142;\n }\n}\n__kernel void map_71445(int32_t N_68508, int32_t K_68510, int32_t D_68512,\n int32_t D_68514, int32_t triD_68516, double d_r_68524,\n int32_t D_68526, int32_t num_groups_71747,\n int32_t virt_groups_71754, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *means_mem_80368, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *mem_80374, __global\n unsigned char *mem_80377, __global\n unsigned char *mem_80380, __global\n unsigned char *mem_80384, __global\n unsigned char *mem_80388, __global\n unsigned char *mem_80392, __global\n unsigned char *mem_80395, __global\n unsigned char *mem_80398, __global\n unsigned char *mem_80402, __global\n unsigned char *mem_80411, __global\n unsigned char *mem_80415, __global\n unsigned char *mem_80419, __global\n unsigned char *mem_80422, __global\n unsigned char *mem_80426, __global\n unsigned char *mem_80430, __global\n unsigned char *mem_80433, __global\n unsigned char *mem_80436, __g",
"lobal\n unsigned char *mem_80439, __global\n unsigned char *mem_80442, __global\n unsigned char *mem_80445, __global\n unsigned char *mem_80448, __global\n unsigned char *mem_80480, __global\n unsigned char *mem_80485, __global\n unsigned char *mem_80490, __global\n unsigned char *mem_80495, __global\n unsigned char *mem_81655, __global\n unsigned char *mem_81658, __global\n unsigned char *mem_81665, __global\n unsigned char *mem_81668)\n{\n const int32_t group_sizze_71737 = rev_gmm_objectivezigroup_sizze_71425;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_71445;\n int32_t local_tid_71446;\n int32_t group_sizze_82366;\n int32_t wave_sizze_82365;\n int32_t group_id_71447;\n \n global_tid_71445 = get_global_id(0);\n local_tid_71446 = get_local_id(0);\n group_sizze_82366 = get_local_size(0);\n wave_sizze_82365 = LOCKSTEP_WIDTH;\n group_id_71447 = get_group_id(0);\n \n int32_t gtid_71423;\n int32_t phys_group_id_82367;\n \n phys_group_id_82367 = get_group_id(0);\n for (int32_t i_82368 = 0; i_82368 < squot32(virt_groups_71754 -\n phys_group_id_82367 +\n num_groups_71747 - 1,\n num_groups_71747); i_82368++) {\n int32_t virt_group_id_82369 = phys_group_id_82367 + i_82368 *\n num_groups_71747;\n \n gtid_71423 = virt_group_id_82369 * group_sizze_71737 + local_tid_71446;\n \n double res_71763;\n double x_71832;\n double res_71833;\n \n if (slt32(gtid_71423, N_68508)) {\n double redout_71765 = 0.0;\n ",
" \n for (int32_t i_71767 = 0; i_71767 < K_68510; i_71767++) {\n double alphas_elem_71768;\n double res_71773;\n double x_71780;\n double res_71790;\n double y_71825;\n double res_71826;\n double res_71827;\n double res_71830;\n \n alphas_elem_71768 = *(__global\n double *) &alphas_mem_80367[i_71767 * 8];\n \n double redout_71774 = 0.0;\n \n for (int32_t i_71775 = 0; i_71775 < D_68526; i_71775++) {\n double x_71776;\n double res_71779;\n \n x_71776 = *(__global double *) &qs_mem_80369[(i_71767 *\n D_68514 +\n i_71775) * 8];\n res_71779 = redout_71774 + x_71776;\n \n double redout_tmp_82372 = res_71779;\n \n redout_71774 = redout_tmp_82372;\n }\n res_71773 = redout_71774;\n x_71780 = alphas_elem_71768 + res_71773;\n for (int32_t i_71785 = 0; i_71785 < D_68526; i_71785++) {\n double x_elem_elem_71786;\n double means_elem_elem_71787;\n double res_71788;\n \n x_elem_elem_71786 = *(__global\n double *) &mem_80374[(i_71785 *\n N_68508 +\n gtid_71423) *\n 8];\n means_elem_elem_71787 = *(__global\n double *) &means_mem_80368[",
"(i_71767 *\n D_68512 +\n i_71785) *\n 8];\n res_71788 = x_elem_elem_71786 - means_elem_elem_71787;\n *(__global double *) &mem_80380[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 + i_71785 *\n group_sizze_71737) * 8] =\n res_71788;\n }\n \n double redout_71791 = 0.0;\n \n for (int32_t i_71792 = 0; i_71792 < D_68526; i_71792++) {\n double qs_elem_elem_71794;\n double res_71795;\n double res_71821;\n double res_71824;\n \n qs_elem_elem_71794 = *(__global\n double *) &qs_mem_80369[(i_71767 *\n D_68514 +\n i_71792) *\n 8];\n \n double redout_71796 = 0.0;\n \n for (int32_t i_71797 = 0; i_71797 < D_68526; i_71797++) {\n double x_71799;\n bool cond_71800;\n double res_71801;\n double res_71817;\n double res_71820;\n \n x_71799 = *(__global\n double *) &mem_80380[(group_id_71447 *\n ",
" (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_71797 *\n group_sizze_71737) *\n 8];\n cond_71800 = slt32(i_71792, i_71797);\n if (cond_71800) {\n res_71801 = 0.0;\n } else {\n bool cond_71802;\n double res_71803;\n \n cond_71802 = i_71792 == i_71797;\n if (cond_71802) {\n double res_71804;\n \n res_71804 = futrts_exp64(qs_elem_elem_71794);\n res_71803 = res_71804;\n } else {\n int32_t y_71805;\n int32_t x_71806;\n int32_t res_71807;\n int32_t gmm_knossos_tri_arg_71808;\n int32_t y_71809;\n int32_t x_71810;\n int32_t res_71811;\n int32_t x_71812;\n int32_t x_71813;\n int32_t y_71814;\n int32_t i_71815;\n double res_71816;\n \n y_71805 = D_68526 - 1;\n x_71806 = D_68526 * y_71805;\n res_71807 = sdiv32(x_71806, 2);\n gmm_knossos_tri_arg_71808 = D_68526 - i_71797;\n y_7180",
"9 = gmm_knossos_tri_arg_71808 - 1;\n x_71810 = gmm_knossos_tri_arg_71808 * y_71809;\n res_71811 = sdiv32(x_71810, 2);\n x_71812 = res_71807 - res_71811;\n x_71813 = i_71792 - i_71797;\n y_71814 = x_71813 - 1;\n i_71815 = x_71812 + y_71814;\n res_71816 = *(__global\n double *) &icf_mem_80370[(i_71767 *\n triD_68516 +\n i_71815) *\n 8];\n res_71803 = res_71816;\n }\n res_71801 = res_71803;\n }\n res_71817 = x_71799 * res_71801;\n res_71820 = redout_71796 + res_71817;\n \n double redout_tmp_82375 = res_71820;\n \n redout_71796 = redout_tmp_82375;\n }\n res_71795 = redout_71796;\n res_71821 = res_71795 * res_71795;\n res_71824 = redout_71791 + res_71821;\n \n double redout_tmp_82374 = res_71824;\n \n redout_71791 = redout_tmp_82374;\n }\n res_71790 = redout_71791;\n y_71825 = 0.5 * res_71790;\n res_71826 = x_71780 - y_71825;\n res_71827 = futrts_exp64(res_71826);\n res_71830 = redout_71765 + res_71827;\n *(__global double *) &mem_80377[(group_id_71447 *\n (group_sizze_71737 * K_68510) +\n ",
" local_tid_71446 + i_71767 *\n group_sizze_71737) * 8] =\n res_71826;\n \n double redout_tmp_82370 = res_71830;\n \n redout_71765 = redout_tmp_82370;\n }\n res_71763 = redout_71765;\n x_71832 = 1.0 / res_71763;\n res_71833 = d_r_68524 * x_71832;\n for (int32_t i_71847 = 0; i_71847 < K_68510; i_71847++) {\n double res_elem_71848;\n double res_71852;\n double res_71853;\n double y_71893;\n double rev_sqnorm_arg_71894;\n \n res_elem_71848 = *(__global\n double *) &mem_80377[(group_id_71447 *\n (group_sizze_71737 *\n K_68510) +\n local_tid_71446 +\n i_71847 *\n group_sizze_71737) *\n 8];\n res_71852 = futrts_exp64(res_elem_71848);\n res_71853 = res_71833 * res_71852;\n for (int32_t i_71862 = 0; i_71862 < D_68526; i_71862++) {\n double qs_elem_elem_71864;\n double x_elem_elem_71865;\n double means_elem_elem_71866;\n double res_71890;\n \n qs_elem_elem_71864 = *(__global\n double *) &qs_mem_80369[(i_71847 *\n D_68514 +\n i_71862) *\n 8];\n",
" x_elem_elem_71865 = *(__global\n double *) &mem_80374[(i_71862 *\n N_68508 +\n gtid_71423) *\n 8];\n means_elem_elem_71866 = *(__global\n double *) &means_mem_80368[(i_71847 *\n D_68512 +\n i_71862) *\n 8];\n for (int32_t i_71870 = 0; i_71870 < D_68526; i_71870++) {\n bool cond_71872;\n double res_71873;\n \n cond_71872 = slt32(i_71862, i_71870);\n if (cond_71872) {\n res_71873 = 0.0;\n } else {\n bool cond_71874;\n double res_71875;\n \n cond_71874 = i_71862 == i_71870;\n if (cond_71874) {\n double res_71876;\n \n res_71876 = futrts_exp64(qs_elem_elem_71864);\n res_71875 = res_71876;\n } else {\n int32_t y_71877;\n int32_t x_71878;\n int32_t res_71879;\n int32_t gmm_knossos_tri_arg_71880;\n int32_t y_71881;\n int32_t x_71882;\n int32_t res_71883;\n int32_t x_71884;\n ",
" int32_t x_71885;\n int32_t y_71886;\n int32_t i_71887;\n double res_71888;\n \n y_71877 = D_68526 - 1;\n x_71878 = D_68526 * y_71877;\n res_71879 = sdiv32(x_71878, 2);\n gmm_knossos_tri_arg_71880 = D_68526 - i_71870;\n y_71881 = gmm_knossos_tri_arg_71880 - 1;\n x_71882 = gmm_knossos_tri_arg_71880 * y_71881;\n res_71883 = sdiv32(x_71882, 2);\n x_71884 = res_71879 - res_71883;\n x_71885 = i_71862 - i_71870;\n y_71886 = x_71885 - 1;\n i_71887 = x_71884 + y_71886;\n res_71888 = *(__global\n double *) &icf_mem_80370[(i_71847 *\n triD_68516 +\n i_71887) *\n 8];\n res_71875 = res_71888;\n }\n res_71873 = res_71875;\n }\n *(__global double *) &mem_80402[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * D_68526) +\n local_tid_71446 +\n i_71862 *\n (group_sizze_71737 *\n D_68526) + ",
"i_71870 *\n group_sizze_71737) *\n 8] = res_71873;\n }\n res_71890 = x_elem_elem_71865 - means_elem_elem_71866;\n *(__global double *) &mem_80398[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 + i_71862 *\n group_sizze_71737) * 8] =\n res_71890;\n }\n y_71893 = 0.0 - res_71853;\n rev_sqnorm_arg_71894 = 0.5 * y_71893;\n for (int32_t i_71898 = 0; i_71898 < D_68526; i_71898++) {\n double res_71900;\n double res_71909;\n double res_71910;\n double redout_71901 = 0.0;\n \n for (int32_t i_71902 = 0; i_71902 < D_68526; i_71902++) {\n double x_71903;\n double x_71904;\n double res_71905;\n double res_71908;\n \n x_71903 = *(__global\n double *) &mem_80398[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_71902 *\n group_sizze_71737) *\n 8];\n x_71904 = *(__global\n double *) &mem_80402[(group_id_71447 *\n ",
" (group_sizze_71737 *\n D_68526 * D_68526) +\n local_tid_71446 +\n (i_71898 *\n (group_sizze_71737 *\n D_68526) + i_71902 *\n group_sizze_71737)) *\n 8];\n res_71905 = x_71903 * x_71904;\n res_71908 = redout_71901 + res_71905;\n \n double redout_tmp_82384 = res_71908;\n \n redout_71901 = redout_tmp_82384;\n }\n res_71900 = redout_71901;\n res_71909 = rev_sqnorm_arg_71894 * res_71900;\n res_71910 = res_71909 + res_71909;\n *(__global double *) &mem_80411[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 + i_71898 *\n group_sizze_71737) * 8] =\n res_71910;\n }\n for (int32_t i_71922 = 0; i_71922 < D_68526; i_71922++) {\n double x_71923;\n double qs_elem_elem_71926;\n double res_71927;\n double res_71936;\n \n x_71923 = *(__global double *) &mem_80411[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n ",
" local_tid_71446 +\n i_71922 *\n group_sizze_71737) *\n 8];\n qs_elem_elem_71926 = *(__global\n double *) &qs_mem_80369[(i_71847 *\n D_68514 +\n i_71922) *\n 8];\n \n double redout_71928 = 0.0;\n \n for (int32_t i_71929 = 0; i_71929 < D_68526; i_71929++) {\n double x_71930;\n double x_71931;\n double res_71932;\n double res_71935;\n \n x_71930 = *(__global\n double *) &mem_80402[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * D_68526) +\n local_tid_71446 +\n (i_71929 *\n (group_sizze_71737 *\n D_68526) + i_71922 *\n group_sizze_71737)) *\n 8];\n x_71931 = *(__global\n double *) &mem_80411[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n ",
" local_tid_71446 +\n i_71929 *\n group_sizze_71737) *\n 8];\n res_71932 = x_71930 * x_71931;\n res_71935 = redout_71928 + res_71932;\n \n double redout_tmp_82388 = res_71935;\n \n redout_71928 = redout_tmp_82388;\n }\n res_71927 = redout_71928;\n res_71936 = 0.0 - res_71927;\n for (int32_t i_71943 = 0; i_71943 < D_68526; i_71943++) {\n double x_71944;\n double res_71946;\n bool cond_71947;\n bool cond_71948;\n \n x_71944 = *(__global\n double *) &mem_80398[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_71943 *\n group_sizze_71737) *\n 8];\n res_71946 = x_71923 * x_71944;\n cond_71947 = slt32(i_71922, i_71943);\n cond_71948 = i_71922 == i_71943;\n if (cond_71947) {\n for (int32_t i_82391 = 0; i_82391 < D_68526;\n i_82391++) {\n *(__global\n double *) &mem_80433[(group_id_71447 *\n (grou",
"p_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82391 *\n group_sizze_71737) *\n 8] = 0.0;\n }\n for (int32_t i_82392 = 0; i_82392 < triD_68516;\n i_82392++) {\n *(__global\n double *) &mem_80436[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82392 *\n group_sizze_71737) *\n 8] = 0.0;\n }\n for (int32_t i_82393 = 0; i_82393 < D_68526;\n i_82393++) {\n *(__global\n double *) &mem_81668[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82393 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_80433[(group_id_71447 *\n (group_sizze_71737 *\n ",
" D_68526) +\n local_tid_71446 +\n i_82393 *\n group_sizze_71737) *\n 8];\n }\n for (int32_t i_82394 = 0; i_82394 < triD_68516;\n i_82394++) {\n *(__global\n double *) &mem_81665[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82394 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_80436[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82394 *\n group_sizze_71737) *\n 8];\n }\n } else {\n if (cond_71948) {\n double res",
"_71955;\n double deltaVec_arg_71956;\n \n res_71955 = futrts_exp64(qs_elem_elem_71926);\n deltaVec_arg_71956 = res_71946 * res_71955;\n for (int32_t i_71961 = 0; i_71961 < D_68526;\n i_71961++) {\n bool cond_71963;\n double res_71964;\n \n cond_71963 = i_71961 == i_71922;\n if (cond_71963) {\n res_71964 = deltaVec_arg_71956;\n } else {\n res_71964 = 0.0;\n }\n *(__global\n double *) &mem_80439[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_71961 *\n group_sizze_71737) *\n 8] = res_71964;\n }\n for (int32_t i_82396 = 0; i_82396 < triD_68516;\n i_82396++) {\n *(__global\n double *) &mem_80442[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n ",
" i_82396 *\n group_sizze_71737) *\n 8] = 0.0;\n }\n for (int32_t i_82397 = 0; i_82397 < D_68526;\n i_82397++) {\n *(__global\n double *) &mem_81658[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82397 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_80439[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82397 *\n group_sizze_71737) *\n 8];\n }\n for (int32_t i_82398 = 0; i_82398 < triD_68516;\n i_82398++) {\n *(__global\n double *) &mem_81655[(group_id_71447 *\n (group_si",
"zze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82398 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_80442[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82398 *\n group_sizze_71737) *\n 8];\n }\n } else {\n int32_t y_71967;\n int32_t x_71968;\n int32_t res_71969;\n int32_t deltaVec_arg_71970;\n \n y_71967 = i_71922 - 1;\n x_71968 = i_71922 * y_71967;\n res_71969 = sdiv32(x_71968, 2);\n deltaVec_arg_71970 = i_71943 + res_71969;\n for (int32_t i_71975 = 0; i_71975 < triD_68516;\n i_71975++) {\n bool cond_71977;\n double res_71978;\n \n cond_71977 ",
"= i_71975 == deltaVec_arg_71970;\n if (cond_71977) {\n res_71978 = res_71946;\n } else {\n res_71978 = 0.0;\n }\n *(__global\n double *) &mem_80445[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_71975 *\n group_sizze_71737) *\n 8] = res_71978;\n }\n for (int32_t i_82400 = 0; i_82400 < D_68526;\n i_82400++) {\n *(__global\n double *) &mem_80448[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82400 *\n group_sizze_71737) *\n 8] = 0.0;\n }\n for (int32_t i_82401 = 0; i_82401 < D_68526;\n i_82401++) {\n *(__global\n double *) &mem_81658[(group_id_71447 *\n (group_sizze_71737 *\n ",
" D_68526) +\n local_tid_71446 +\n i_82401 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_80448[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82401 *\n group_sizze_71737) *\n 8];\n }\n for (int32_t i_82402 = 0; i_82402 < triD_68516;\n i_82402++) {\n *(__global\n double *) &mem_81655[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82402 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_80445[(group_id_71447 *\n ",
" (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82402 *\n group_sizze_71737) *\n 8];\n }\n }\n for (int32_t i_82403 = 0; i_82403 < D_68526;\n i_82403++) {\n *(__global\n double *) &mem_81668[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82403 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_81658[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82403 *\n group_sizze_71737) *\n 8];\n }\n ",
" for (int32_t i_82404 = 0; i_82404 < triD_68516;\n i_82404++) {\n *(__global\n double *) &mem_81665[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82404 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_81655[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82404 *\n group_sizze_71737) *\n 8];\n }\n }\n for (int32_t i_82405 = 0; i_82405 < triD_68516;\n i_82405++) {\n *(__global double *) &mem_80430[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516 *\n D_68526) +\n local_tid_71446 +\n i_71943 *\n ",
" (group_sizze_71737 *\n triD_68516) +\n i_82405 *\n group_sizze_71737) *\n 8] = *(__global\n double *) &mem_81665[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516) +\n local_tid_71446 +\n i_82405 *\n group_sizze_71737) *\n 8];\n }\n for (int32_t i_82406 = 0; i_82406 < D_68526;\n i_82406++) {\n *(__global double *) &mem_80426[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 *\n D_68526) +\n local_tid_71446 +\n i_71943 *\n (group_sizze_71737 *\n D_68526) +\n i_82406 *\n group_sizze_71737) *",
"\n 8] = *(__global\n double *) &mem_81668[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82406 *\n group_sizze_71737) *\n 8];\n }\n }\n for (int32_t i_71987 = 0; i_71987 < D_68526; i_71987++) {\n double res_71989;\n double redout_71990 = 0.0;\n \n for (int32_t i_71991 = 0; i_71991 < D_68526;\n i_71991++) {\n double x_71992;\n double res_71995;\n \n x_71992 = *(__global\n double *) &mem_80426[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 *\n D_68526) +\n local_tid_71446 +\n (i_71991 *\n (group_sizze_71737 *\n D_68526) +\n ",
" i_71987 *\n group_sizze_71737)) *\n 8];\n res_71995 = redout_71990 + x_71992;\n \n double redout_tmp_82408 = res_71995;\n \n redout_71990 = redout_tmp_82408;\n }\n res_71989 = redout_71990;\n *(__global double *) &mem_80415[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * D_68526) +\n local_tid_71446 +\n i_71922 *\n (group_sizze_71737 *\n D_68526) + i_71987 *\n group_sizze_71737) *\n 8] = res_71989;\n }\n for (int32_t i_72001 = 0; i_72001 < triD_68516; i_72001++) {\n double res_72003;\n double redout_72004 = 0.0;\n \n for (int32_t i_72005 = 0; i_72005 < D_68526;\n i_72005++) {\n double x_72006;\n double res_72009;\n \n x_72006 = *(__global\n double *) &mem_80430[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516 *\n D_68526) +\n ",
" local_tid_71446 +\n (i_72005 *\n (group_sizze_71737 *\n triD_68516) +\n i_72001 *\n group_sizze_71737)) *\n 8];\n res_72009 = redout_72004 + x_72006;\n \n double redout_tmp_82410 = res_72009;\n \n redout_72004 = redout_tmp_82410;\n }\n res_72003 = redout_72004;\n *(__global double *) &mem_80419[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516 *\n D_68526) +\n local_tid_71446 +\n i_71922 *\n (group_sizze_71737 *\n triD_68516) +\n i_72001 *\n group_sizze_71737) *\n 8] = res_72003;\n }\n *(__global double *) &mem_80422[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 + i_71922 *\n ",
" group_sizze_71737) * 8] =\n res_71936;\n }\n for (int32_t i_82411 = 0; i_82411 < D_68526; i_82411++) {\n *(__global double *) &mem_80384[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * K_68510) +\n local_tid_71446 + i_71847 *\n (group_sizze_71737 *\n D_68526) + i_82411 *\n group_sizze_71737) * 8] =\n *(__global double *) &mem_80422[(group_id_71447 *\n (group_sizze_71737 *\n D_68526) +\n local_tid_71446 +\n i_82411 *\n group_sizze_71737) *\n 8];\n }\n for (int32_t i_72018 = 0; i_72018 < triD_68516; i_72018++) {\n double res_72020;\n double redout_72021 = 0.0;\n \n for (int32_t i_72022 = 0; i_72022 < D_68526; i_72022++) {\n double x_72023;\n double res_72026;\n \n x_72023 = *(__global\n double *) &mem_80419[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516 *\n D_68526) +\n local_tid_71446 +\n ",
" (i_72022 *\n (group_sizze_71737 *\n triD_68516) +\n i_72018 *\n group_sizze_71737)) *\n 8];\n res_72026 = redout_72021 + x_72023;\n \n double redout_tmp_82413 = res_72026;\n \n redout_72021 = redout_tmp_82413;\n }\n res_72020 = redout_72021;\n *(__global double *) &mem_80392[(group_id_71447 *\n (group_sizze_71737 *\n triD_68516 * K_68510) +\n local_tid_71446 + i_71847 *\n (group_sizze_71737 *\n triD_68516) + i_72018 *\n group_sizze_71737) * 8] =\n res_72020;\n }\n for (int32_t i_72032 = 0; i_72032 < D_68526; i_72032++) {\n double res_72034;\n double res_72041;\n double redout_72035 = 0.0;\n \n for (int32_t i_72036 = 0; i_72036 < D_68526; i_72036++) {\n double x_72037;\n double res_72040;\n \n x_72037 = *(__global\n double *) &mem_80415[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * D_68526) +\n ",
" local_tid_71446 +\n (i_72036 *\n (group_sizze_71737 *\n D_68526) + i_72032 *\n group_sizze_71737)) *\n 8];\n res_72040 = redout_72035 + x_72037;\n \n double redout_tmp_82415 = res_72040;\n \n redout_72035 = redout_tmp_82415;\n }\n res_72034 = redout_72035;\n res_72041 = res_71853 + res_72034;\n *(__global double *) &mem_80388[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * K_68510) +\n local_tid_71446 + i_71847 *\n (group_sizze_71737 *\n D_68526) + i_72032 *\n group_sizze_71737) * 8] =\n res_72041;\n }\n *(__global double *) &mem_80395[(group_id_71447 *\n (group_sizze_71737 * K_68510) +\n local_tid_71446 + i_71847 *\n group_sizze_71737) * 8] =\n res_71853;\n }\n }\n if (slt32(gtid_71423, N_68508)) {\n for (int32_t i_82416 = 0; i_82416 < K_68510; i_82416++) {\n *(__global double *) &mem_80480[(gtid_71423 + i_82416 *\n N_68508) * 8] = *(__global\n ",
" double *) &mem_80395[(group_id_71447 *\n (group_sizze_71737 *\n K_68510) +\n local_tid_71446 +\n i_82416 *\n group_sizze_71737) *\n 8];\n }\n }\n if (slt32(gtid_71423, N_68508)) {\n for (int32_t i_82417 = 0; i_82417 < K_68510; i_82417++) {\n for (int32_t i_82418 = 0; i_82418 < D_68526; i_82418++) {\n *(__global double *) &mem_80485[(N_68508 * D_68526 * 0 +\n N_68508 * 0 + gtid_71423 +\n (i_82417 * (N_68508 *\n D_68526) +\n i_82418 * N_68508)) * 8] =\n *(__global double *) &mem_80384[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * K_68510) +\n local_tid_71446 +\n (i_82417 *\n (group_sizze_71737 *\n D_68526) + i_82418 *\n group_sizze_71737)) *\n 8];\n ",
" }\n }\n }\n if (slt32(gtid_71423, N_68508)) {\n for (int32_t i_82419 = 0; i_82419 < K_68510; i_82419++) {\n for (int32_t i_82420 = 0; i_82420 < D_68526; i_82420++) {\n *(__global double *) &mem_80490[(N_68508 * D_68526 * 0 +\n N_68508 * 0 + gtid_71423 +\n (i_82419 * (N_68508 *\n D_68526) +\n i_82420 * N_68508)) * 8] =\n *(__global double *) &mem_80388[(group_id_71447 *\n (group_sizze_71737 *\n D_68526 * K_68510) +\n local_tid_71446 +\n (i_82419 *\n (group_sizze_71737 *\n D_68526) + i_82420 *\n group_sizze_71737)) *\n 8];\n }\n }\n }\n if (slt32(gtid_71423, N_68508)) {\n for (int32_t i_82421 = 0; i_82421 < K_68510; i_82421++) {\n for (int32_t i_82422 = 0; i_82422 < triD_68516; i_82422++) {\n *(__global double *) &mem_80495[(N_68508 * triD_68516 * 0 +\n N_68508 * 0 + gtid_71423 +\n (i_82421 * (N_68508 *\n triD_68516) +\n i_82422 * N_68508)) * 8] =\n *(__global double *) &mem_80392[(group_id_71447 *\n ",
" (group_sizze_71737 *\n triD_68516 *\n K_68510) +\n local_tid_71446 +\n (i_82421 *\n (group_sizze_71737 *\n triD_68516) +\n i_82422 *\n group_sizze_71737)) *\n 8];\n }\n }\n }\n }\n}\n__kernel void map_72450(int32_t N_68508, int32_t D_68509, int32_t K_68510,\n int32_t K_68511, int32_t K_68513, int32_t K_68515,\n int32_t triD_68516, int32_t D_68526,\n int32_t num_groups_76462, int32_t virt_groups_76469,\n __global unsigned char *x_mem_80366, __global\n unsigned char *mem_80631, __global\n unsigned char *mem_80635, __global\n unsigned char *mem_80639, __global\n unsigned char *mem_80649, __global\n unsigned char *mem_80652, __global\n unsigned char *mem_80655, __global\n unsigned char *mem_80659, __global\n unsigned char *mem_80668, __global\n unsigned char *mem_80672, __global\n unsigned char *mem_80676, __global\n unsigned char *mem_80679, __global\n unsigned char *mem_80683, __global\n unsigned char *mem_80687, __global\n unsigned char *mem_80690, __global\n unsigned char *mem_80693, __global\n ",
" unsigned char *mem_80696, __global\n unsigned char *mem_80699, __global\n unsigned char *mem_80702, __global\n unsigned char *mem_80705, __global\n unsigned char *mem_80724, __global\n unsigned char *mem_80727, __global\n unsigned char *mem_80731, __global\n unsigned char *mem_80736, __global\n unsigned char *mem_80741, __global\n unsigned char *mem_80746, __global\n unsigned char *mem_81705, __global\n unsigned char *mem_81708, __global\n unsigned char *mem_81715, __global\n unsigned char *mem_81718)\n{\n const int32_t group_sizze_76452 = rev_gmm_objectivezigroup_sizze_72430;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_72450;\n int32_t local_tid_72451;\n int32_t group_sizze_82532;\n int32_t wave_sizze_82531;\n int32_t group_id_72452;\n \n global_tid_72450 = get_global_id(0);\n local_tid_72451 = get_local_id(0);\n group_sizze_82532 = get_local_size(0);\n wave_sizze_82531 = LOCKSTEP_WIDTH;\n group_id_72452 = get_group_id(0);\n \n int32_t gtid_72426;\n int32_t gtid_72427;\n int32_t phys_group_id_82533;\n \n phys_group_id_82533 = get_group_id(0);\n for (int32_t i_82534 = 0; i_82534 < squot32(virt_groups_76469 -\n phys_group_id_82533 +\n num_groups_76462 - 1,\n num_groups_76462); i_82534++) {\n int32_t virt_group_id_82535 = phys_group_id_82533 + i_82534 *\n num_groups_76462;\n \n gtid_72426 = squot32(virt_group_id_82535 * group_sizze_76452 +\n local_tid_72451, K_68510);\n ",
"gtid_72427 = virt_group_id_82535 * group_sizze_76452 + local_tid_72451 -\n squot32(virt_group_id_82535 * group_sizze_76452 + local_tid_72451,\n K_68510) * K_68510;\n \n double res_76495;\n double res_elem_76497;\n double res_76501;\n double res_76502;\n double y_76542;\n double rev_sqnorm_arg_76543;\n \n if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {\n res_76495 = *(__global double *) &mem_80652[gtid_72426 * 8];\n res_elem_76497 = *(__global double *) &mem_80649[(gtid_72426 *\n K_68510 +\n gtid_72427) * 8];\n res_76501 = futrts_exp64(res_elem_76497);\n res_76502 = res_76495 * res_76501;\n for (int32_t i_76511 = 0; i_76511 < D_68526; i_76511++) {\n double qs_elem_elem_76513;\n double x_elem_elem_76514;\n double means_elem_elem_76515;\n double res_76539;\n \n qs_elem_elem_76513 = *(__global double *) &mem_80631[(i_76511 *\n K_68513 +\n gtid_72427) *\n 8];\n x_elem_elem_76514 = *(__global\n double *) &x_mem_80366[(gtid_72426 *\n D_68509 +\n i_76511) * 8];\n means_elem_elem_76515 = *(__global\n double *) &mem_80635[(i_76511 *\n K_68511 +\n gtid_72427) *\n ",
" 8];\n for (int32_t i_76519 = 0; i_76519 < D_68526; i_76519++) {\n bool cond_76521;\n double res_76522;\n \n cond_76521 = slt32(i_76511, i_76519);\n if (cond_76521) {\n res_76522 = 0.0;\n } else {\n bool cond_76523;\n double res_76524;\n \n cond_76523 = i_76511 == i_76519;\n if (cond_76523) {\n double res_76525;\n \n res_76525 = futrts_exp64(qs_elem_elem_76513);\n res_76524 = res_76525;\n } else {\n int32_t y_76526;\n int32_t x_76527;\n int32_t res_76528;\n int32_t gmm_knossos_tri_arg_76529;\n int32_t y_76530;\n int32_t x_76531;\n int32_t res_76532;\n int32_t x_76533;\n int32_t x_76534;\n int32_t y_76535;\n int32_t i_76536;\n double res_76537;\n \n y_76526 = D_68526 - 1;\n x_76527 = D_68526 * y_76526;\n res_76528 = sdiv32(x_76527, 2);\n gmm_knossos_tri_arg_76529 = D_68526 - i_76519;\n y_76530 = gmm_knossos_tri_arg_76529 - 1;\n x_76531 = gmm_knossos_tri_arg_76529 * y_76530;\n res_76532 = sdiv32(x_76531, 2);\n x_76533 = res_76528 - res_76532;\n x_76534 = i_76511 - i_76519;\n y_",
"76535 = x_76534 - 1;\n i_76536 = x_76533 + y_76535;\n res_76537 = *(__global\n double *) &mem_80639[(i_76536 *\n K_68515 +\n gtid_72427) *\n 8];\n res_76524 = res_76537;\n }\n res_76522 = res_76524;\n }\n *(__global double *) &mem_80659[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 * D_68526) +\n local_tid_72451 + i_76511 *\n (group_sizze_76452 *\n D_68526) + i_76519 *\n group_sizze_76452) * 8] =\n res_76522;\n }\n res_76539 = x_elem_elem_76514 - means_elem_elem_76515;\n *(__global double *) &mem_80655[(group_id_72452 *\n (group_sizze_76452 * D_68526) +\n local_tid_72451 + i_76511 *\n group_sizze_76452) * 8] =\n res_76539;\n }\n y_76542 = 0.0 - res_76502;\n rev_sqnorm_arg_76543 = 0.5 * y_76542;\n for (int32_t i_76547 = 0; i_76547 < D_68526; i_76547++) {\n double res_76549;\n double res_76558;\n double res_76559;\n double redout_76550 = 0.0;\n \n for (int32_t i_76551 = 0; i_76551 < D_68526; i_76551++) {\n double x_76552;\n ",
" double x_76553;\n double res_76554;\n double res_76557;\n \n x_76552 = *(__global double *) &mem_80655[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_76551 *\n group_sizze_76452) *\n 8];\n x_76553 = *(__global double *) &mem_80659[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 *\n D_68526) +\n local_tid_72451 +\n (i_76547 *\n (group_sizze_76452 *\n D_68526) +\n i_76551 *\n group_sizze_76452)) *\n 8];\n res_76554 = x_76552 * x_76553;\n res_76557 = redout_76550 + res_76554;\n \n double redout_tmp_82540 = res_76557;\n \n redout_76550 = redout_tmp_82540;\n }\n res_76549 = redout_76550;\n res_76558 = rev_sqnorm_arg_76543 * res_76549;\n res_76559 = res_76558 + res_76558;\n *(__global double *) &me",
"m_80668[(group_id_72452 *\n (group_sizze_76452 * D_68526) +\n local_tid_72451 + i_76547 *\n group_sizze_76452) * 8] =\n res_76559;\n }\n for (int32_t i_76571 = 0; i_76571 < D_68526; i_76571++) {\n double x_76572;\n double qs_elem_elem_76575;\n double res_76576;\n double res_76585;\n \n x_76572 = *(__global double *) &mem_80668[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_76571 *\n group_sizze_76452) *\n 8];\n qs_elem_elem_76575 = *(__global double *) &mem_80631[(i_76571 *\n K_68513 +\n gtid_72427) *\n 8];\n \n double redout_76577 = 0.0;\n \n for (int32_t i_76578 = 0; i_76578 < D_68526; i_76578++) {\n double x_76579;\n double x_76580;\n double res_76581;\n double res_76584;\n \n x_76579 = *(__global double *) &mem_80659[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 *\n D_68526) +\n ",
" local_tid_72451 +\n (i_76578 *\n (group_sizze_76452 *\n D_68526) +\n i_76571 *\n group_sizze_76452)) *\n 8];\n x_76580 = *(__global double *) &mem_80668[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_76578 *\n group_sizze_76452) *\n 8];\n res_76581 = x_76579 * x_76580;\n res_76584 = redout_76577 + res_76581;\n \n double redout_tmp_82544 = res_76584;\n \n redout_76577 = redout_tmp_82544;\n }\n res_76576 = redout_76577;\n res_76585 = 0.0 - res_76576;\n for (int32_t i_76592 = 0; i_76592 < D_68526; i_76592++) {\n double x_76593;\n double res_76595;\n bool cond_76596;\n bool cond_76597;\n \n x_76593 = *(__global double *) &mem_80655[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local",
"_tid_72451 +\n i_76592 *\n group_sizze_76452) *\n 8];\n res_76595 = x_76572 * x_76593;\n cond_76596 = slt32(i_76571, i_76592);\n cond_76597 = i_76571 == i_76592;\n if (cond_76596) {\n for (int32_t i_82547 = 0; i_82547 < D_68526;\n i_82547++) {\n *(__global double *) &mem_80690[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82547 *\n group_sizze_76452) *\n 8] = 0.0;\n }\n for (int32_t i_82548 = 0; i_82548 < triD_68516;\n i_82548++) {\n *(__global double *) &mem_80693[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82548 *\n group_sizze_76452) *\n 8] = 0.0;\n }\n for (int32_t i_82549 = 0; i_82549 < D_68526;\n i_82549++) {\n *(__global double *) &mem_81718[(group_id_72452 *\n ",
" (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82549 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_80690[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82549 *\n group_sizze_76452) *\n 8];\n }\n for (int32_t i_82550 = 0; i_82550 < triD_68516;\n i_82550++) {\n *(__global double *) &mem_81715[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82550 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_80693[(group_id_72452 *\n ",
" (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82550 *\n group_sizze_76452) *\n 8];\n }\n } else {\n if (cond_76597) {\n double res_76604;\n double deltaVec_arg_76605;\n \n res_76604 = futrts_exp64(qs_elem_elem_76575);\n deltaVec_arg_76605 = res_76595 * res_76604;\n for (int32_t i_76610 = 0; i_76610 < D_68526;\n i_76610++) {\n bool cond_76612;\n double res_76613;\n \n cond_76612 = i_76610 == i_76571;\n if (cond_76612) {\n res_76613 = deltaVec_arg_76605;\n } else {\n res_76613 = 0.0;\n }\n *(__global\n double *) &mem_80696[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_76610 *\n group_sizze_76452) *\n ",
" 8] = res_76613;\n }\n for (int32_t i_82552 = 0; i_82552 < triD_68516;\n i_82552++) {\n *(__global\n double *) &mem_80699[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82552 *\n group_sizze_76452) *\n 8] = 0.0;\n }\n for (int32_t i_82553 = 0; i_82553 < D_68526;\n i_82553++) {\n *(__global\n double *) &mem_81708[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82553 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_80696[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82553 *\n ",
" group_sizze_76452) *\n 8];\n }\n for (int32_t i_82554 = 0; i_82554 < triD_68516;\n i_82554++) {\n *(__global\n double *) &mem_81705[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82554 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_80699[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82554 *\n group_sizze_76452) *\n 8];\n }\n } else {\n int32_t y_76616;\n int32_t x_76617;\n int32_t res_76618;\n int32_t deltaVec_arg_76619;\n \n y_76616 = i_76571 - 1;\n x_76617 = i_76571 * y_76616;\n res_76618 = sdiv32(x_76617, 2);\n ",
" deltaVec_arg_76619 = i_76592 + res_76618;\n for (int32_t i_76624 = 0; i_76624 < triD_68516;\n i_76624++) {\n bool cond_76626;\n double res_76627;\n \n cond_76626 = i_76624 == deltaVec_arg_76619;\n if (cond_76626) {\n res_76627 = res_76595;\n } else {\n res_76627 = 0.0;\n }\n *(__global\n double *) &mem_80702[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_76624 *\n group_sizze_76452) *\n 8] = res_76627;\n }\n for (int32_t i_82556 = 0; i_82556 < D_68526;\n i_82556++) {\n *(__global\n double *) &mem_80705[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82556 *\n group_sizze_76452) *\n 8] = 0.0;\n }\n for (int32_t i_82557 = 0; i_82557 < D_68526;\n ",
" i_82557++) {\n *(__global\n double *) &mem_81708[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82557 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_80705[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82557 *\n group_sizze_76452) *\n 8];\n }\n for (int32_t i_82558 = 0; i_82558 < triD_68516;\n i_82558++) {\n *(__global\n double *) &mem_81705[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82558 *\n group_sizze_76452) *\n 8] = *(__global\n ",
" double *) &mem_80702[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82558 *\n group_sizze_76452) *\n 8];\n }\n }\n for (int32_t i_82559 = 0; i_82559 < D_68526;\n i_82559++) {\n *(__global double *) &mem_81718[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82559 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_81708[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82559 *\n group_sizze_76452) ",
"*\n 8];\n }\n for (int32_t i_82560 = 0; i_82560 < triD_68516;\n i_82560++) {\n *(__global double *) &mem_81715[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82560 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_81705[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82560 *\n group_sizze_76452) *\n 8];\n }\n }\n for (int32_t i_82561 = 0; i_82561 < triD_68516; i_82561++) {\n *(__global double *) &mem_80687[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516 *\n D_68526) +\n local_tid_72451 ",
"+\n i_76592 *\n (group_sizze_76452 *\n triD_68516) +\n i_82561 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_81715[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 +\n i_82561 *\n group_sizze_76452) *\n 8];\n }\n for (int32_t i_82562 = 0; i_82562 < D_68526; i_82562++) {\n *(__global double *) &mem_80683[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 * D_68526) +\n local_tid_72451 +\n i_76592 *\n (group_sizze_76452 *\n D_68526) + i_82562 *\n group_sizze_76452) *\n 8] = *(__global\n double *) &mem_81718[",
"(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 +\n i_82562 *\n group_sizze_76452) *\n 8];\n }\n }\n for (int32_t i_76636 = 0; i_76636 < D_68526; i_76636++) {\n double res_76638;\n double redout_76639 = 0.0;\n \n for (int32_t i_76640 = 0; i_76640 < D_68526; i_76640++) {\n double x_76641;\n double res_76644;\n \n x_76641 = *(__global\n double *) &mem_80683[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 * D_68526) +\n local_tid_72451 +\n (i_76640 *\n (group_sizze_76452 *\n D_68526) + i_76636 *\n group_sizze_76452)) *\n 8];\n res_76644 = redout_76639 + x_76641;\n \n double redout_tmp_82564 = res_76644;\n \n redout_76639 = redout_tmp_82564;\n }\n ",
" res_76638 = redout_76639;\n *(__global double *) &mem_80672[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 * D_68526) +\n local_tid_72451 + i_76571 *\n (group_sizze_76452 *\n D_68526) + i_76636 *\n group_sizze_76452) * 8] =\n res_76638;\n }\n for (int32_t i_76650 = 0; i_76650 < triD_68516; i_76650++) {\n double res_76652;\n double redout_76653 = 0.0;\n \n for (int32_t i_76654 = 0; i_76654 < D_68526; i_76654++) {\n double x_76655;\n double res_76658;\n \n x_76655 = *(__global\n double *) &mem_80687[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516 *\n D_68526) +\n local_tid_72451 +\n (i_76654 *\n (group_sizze_76452 *\n triD_68516) +\n i_76650 *\n group_sizze_76452)) *\n 8];\n res_76658 = redout_76653 + x_76655;\n \n double redout_tmp_82566 = res_76658;\n ",
" \n redout_76653 = redout_tmp_82566;\n }\n res_76652 = redout_76653;\n *(__global double *) &mem_80676[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516 * D_68526) +\n local_tid_72451 + i_76571 *\n (group_sizze_76452 *\n triD_68516) + i_76650 *\n group_sizze_76452) * 8] =\n res_76652;\n }\n *(__global double *) &mem_80679[(group_id_72452 *\n (group_sizze_76452 * D_68526) +\n local_tid_72451 + i_76571 *\n group_sizze_76452) * 8] =\n res_76585;\n }\n for (int32_t i_76667 = 0; i_76667 < triD_68516; i_76667++) {\n double res_76669;\n double redout_76670 = 0.0;\n \n for (int32_t i_76671 = 0; i_76671 < D_68526; i_76671++) {\n double x_76672;\n double res_76675;\n \n x_76672 = *(__global double *) &mem_80676[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516 *\n D_68526) +\n local_tid_72451 +\n (i_76671 *\n (group_sizze_76452 *\n ",
" triD_68516) +\n i_76667 *\n group_sizze_76452)) *\n 8];\n res_76675 = redout_76670 + x_76672;\n \n double redout_tmp_82568 = res_76675;\n \n redout_76670 = redout_tmp_82568;\n }\n res_76669 = redout_76670;\n *(__global double *) &mem_80724[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 + i_76667 *\n group_sizze_76452) * 8] =\n res_76669;\n }\n for (int32_t i_76681 = 0; i_76681 < D_68526; i_76681++) {\n double res_76683;\n double res_76690;\n double redout_76684 = 0.0;\n \n for (int32_t i_76685 = 0; i_76685 < D_68526; i_76685++) {\n double x_76686;\n double res_76689;\n \n x_76686 = *(__global double *) &mem_80672[(group_id_72452 *\n (group_sizze_76452 *\n D_68526 *\n D_68526) +\n local_tid_72451 +\n (i_76685 *\n (group_sizze_76452 *\n D_68526) +\n i_76681 *\n",
" group_sizze_76452)) *\n 8];\n res_76689 = redout_76684 + x_76686;\n \n double redout_tmp_82570 = res_76689;\n \n redout_76684 = redout_tmp_82570;\n }\n res_76683 = redout_76684;\n res_76690 = res_76502 + res_76683;\n *(__global double *) &mem_80727[(group_id_72452 *\n (group_sizze_76452 * D_68526) +\n local_tid_72451 + i_76681 *\n group_sizze_76452) * 8] =\n res_76690;\n }\n }\n if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {\n *(__global double *) &mem_80731[(gtid_72426 * K_68510 +\n gtid_72427) * 8] = res_76502;\n }\n if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {\n for (int32_t i_82571 = 0; i_82571 < D_68526; i_82571++) {\n *(__global double *) &mem_80736[(K_68510 * N_68508 * 0 +\n gtid_72426 * K_68510 +\n gtid_72427 + i_82571 *\n (K_68510 * N_68508)) * 8] =\n *(__global double *) &mem_80679[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 + i_82571 *\n group_sizze_76452) * 8];\n }\n }\n if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {\n for (int32_t i_82572 = 0; i_82572 < D_",
"68526; i_82572++) {\n *(__global double *) &mem_80741[(K_68510 * N_68508 * 0 +\n gtid_72426 * K_68510 +\n gtid_72427 + i_82572 *\n (K_68510 * N_68508)) * 8] =\n *(__global double *) &mem_80727[(group_id_72452 *\n (group_sizze_76452 *\n D_68526) +\n local_tid_72451 + i_82572 *\n group_sizze_76452) * 8];\n }\n }\n if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {\n for (int32_t i_82573 = 0; i_82573 < triD_68516; i_82573++) {\n *(__global double *) &mem_80746[(K_68510 * N_68508 * 0 +\n gtid_72426 * K_68510 +\n gtid_72427 + i_82573 *\n (K_68510 * N_68508)) * 8] =\n *(__global double *) &mem_80724[(group_id_72452 *\n (group_sizze_76452 *\n triD_68516) +\n local_tid_72451 + i_82573 *\n group_sizze_76452) * 8];\n }\n }\n }\n}\n__kernel void map_72987(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *mem_81347, __global\n unsigned char *mem_81359, __global\n unsigned char *mem_81364)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_72987;\n int32_t local_tid_72988;\n int32_t group_sizze_82988;\n int32_t wave_s",
"izze_82987;\n int32_t group_id_72989;\n \n global_tid_72987 = get_global_id(0);\n local_tid_72988 = get_local_id(0);\n group_sizze_82988 = get_local_size(0);\n wave_sizze_82987 = LOCKSTEP_WIDTH;\n group_id_72989 = get_group_id(0);\n \n int32_t gtid_72976;\n int32_t gtid_72977;\n int32_t gtid_72978;\n \n gtid_72976 = squot32(global_tid_72987, K_68510 * D_68526);\n gtid_72977 = squot32(global_tid_72987 - squot32(global_tid_72987, K_68510 *\n D_68526) * (K_68510 *\n D_68526),\n D_68526);\n gtid_72978 = global_tid_72987 - squot32(global_tid_72987, K_68510 *\n D_68526) * (K_68510 * D_68526) -\n squot32(global_tid_72987 - squot32(global_tid_72987, K_68510 *\n D_68526) * (K_68510 * D_68526),\n D_68526) * D_68526;\n \n double res_77857;\n double res_77859;\n double res_77873;\n \n if ((slt32(gtid_72976, N_68508) && slt32(gtid_72977, K_68510)) &&\n slt32(gtid_72978, D_68526)) {\n res_77857 = *(__global double *) &mem_81347[(gtid_72977 * N_68508 +\n gtid_72976) * 8];\n \n double x_77862 = 0.0;\n \n for (int32_t chunk_offset_77861 = 0; chunk_offset_77861 < D_68526;\n chunk_offset_77861++) {\n double x_77869;\n double res_77872;\n \n x_77869 = *(__global double *) &mem_81359[(chunk_offset_77861 *\n (D_68526 * K_68510 *\n N_68508) + gtid_72976 *\n (D_68526 * K_68510) +\n gtid_72977 * D_68526 +\n gtid_72978) ",
"* 8];\n res_77872 = x_77862 + x_77869;\n \n double x_tmp_82989 = res_77872;\n \n x_77862 = x_tmp_82989;\n }\n res_77859 = x_77862;\n res_77873 = res_77857 + res_77859;\n }\n if ((slt32(gtid_72976, N_68508) && slt32(gtid_72977, K_68510)) &&\n slt32(gtid_72978, D_68526)) {\n *(__global double *) &mem_81364[(gtid_72976 * (D_68526 * K_68510) +\n gtid_72977 * D_68526 + gtid_72978) *\n 8] = res_77873;\n }\n}\n__kernel void map_73051(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *mem_81387, __global\n unsigned char *mem_81391, __global\n unsigned char *mem_81396)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73051;\n int32_t local_tid_73052;\n int32_t group_sizze_83035;\n int32_t wave_sizze_83034;\n int32_t group_id_73053;\n \n global_tid_73051 = get_global_id(0);\n local_tid_73052 = get_local_id(0);\n group_sizze_83035 = get_local_size(0);\n wave_sizze_83034 = LOCKSTEP_WIDTH;\n group_id_73053 = get_group_id(0);\n \n int32_t gtid_73040;\n int32_t gtid_73041;\n int32_t gtid_73042;\n \n gtid_73040 = squot32(global_tid_73051, K_68510 * D_68526);\n gtid_73041 = squot32(global_tid_73051 - squot32(global_tid_73051, K_68510 *\n D_68526) * (K_68510 *\n D_68526),\n D_68526);\n gtid_73042 = global_tid_73051 - squot32(global_tid_73051, K_68510 *\n D_68526) * (K_68510 * D_68526) -\n squot32(global_tid_73051 - squot32(global_tid_73051, K_68510 *\n D_68526) * (K_68510 * D_68526),\n D_68526) * D_685",
"26;\n \n double res_77922;\n double res_77923;\n double res_77924;\n \n if ((slt32(gtid_73040, N_68508) && slt32(gtid_73041, K_68510)) &&\n slt32(gtid_73042, D_68526)) {\n res_77922 = *(__global double *) &mem_81391[(gtid_73041 * N_68508 +\n gtid_73040) * 8];\n res_77923 = *(__global double *) &mem_81387[(gtid_73040 * (D_68526 *\n K_68510) +\n gtid_73041 * D_68526 +\n gtid_73042) * 8];\n res_77924 = res_77922 + res_77923;\n }\n if ((slt32(gtid_73040, N_68508) && slt32(gtid_73041, K_68510)) &&\n slt32(gtid_73042, D_68526)) {\n *(__global double *) &mem_81396[(gtid_73040 * (D_68526 * K_68510) +\n gtid_73041 * D_68526 + gtid_73042) *\n 8] = res_77924;\n }\n}\n__kernel void map_73155(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n int32_t num_groups_77829, int32_t virt_groups_77836,\n __global unsigned char *mem_81337, __global\n unsigned char *mem_81343)\n{\n const int32_t group_sizze_77819 = rev_gmm_objectivezigroup_sizze_73135;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73155;\n int32_t local_tid_73156;\n int32_t group_sizze_82981;\n int32_t wave_sizze_82980;\n int32_t group_id_73157;\n \n global_tid_73155 = get_global_id(0);\n local_tid_73156 = get_local_id(0);\n group_sizze_82981 = get_local_size(0);\n wave_sizze_82980 = LOCKSTEP_WIDTH;\n group_id_73157 = get_group_id(0);\n \n int32_t gtid_73131;\n int32_t gtid_73132;\n int32_t phys_group_id_82982;\n \n phys_group_id_82982 = get_group_id(0);\n for (int32_t i_82983 = 0; i_82983 < squot32(virt_groups_778",
"36 -\n phys_group_id_82982 +\n num_groups_77829 - 1,\n num_groups_77829); i_82983++) {\n int32_t virt_group_id_82984 = phys_group_id_82982 + i_82983 *\n num_groups_77829;\n \n gtid_73131 = squot32(virt_group_id_82984 * group_sizze_77819 +\n local_tid_73156, K_68510);\n gtid_73132 = virt_group_id_82984 * group_sizze_77819 + local_tid_73156 -\n squot32(virt_group_id_82984 * group_sizze_77819 + local_tid_73156,\n K_68510) * K_68510;\n if (slt32(gtid_73131, N_68508) && slt32(gtid_73132, K_68510)) { }\n if (slt32(gtid_73131, N_68508) && slt32(gtid_73132, K_68510)) {\n for (int32_t i_82985 = 0; i_82985 < D_68526; i_82985++) {\n for (int32_t i_82986 = 0; i_82986 < D_68526; i_82986++) {\n *(__global double *) &mem_81343[(gtid_73131 * K_68510 +\n gtid_73132 + (i_82985 *\n (K_68510 *\n N_68508 *\n D_68526) +\n i_82986 *\n (K_68510 *\n N_68508))) *\n 8] = *(__global\n double *) &mem_81337[(gtid_73131 *\n K_68510 +\n gtid_73132 +\n ",
" (i_82986 *\n (K_68510 *\n N_68508 *\n D_68526) +\n i_82985 *\n (K_68510 *\n N_68508))) *\n 8];\n }\n }\n }\n }\n}\n__kernel void map_73182(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, __global unsigned char *mem_81301,\n __global unsigned char *mem_81306)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73182;\n int32_t local_tid_73183;\n int32_t group_sizze_82934;\n int32_t wave_sizze_82933;\n int32_t group_id_73184;\n \n global_tid_73182 = get_global_id(0);\n local_tid_73183 = get_local_id(0);\n group_sizze_82934 = get_local_size(0);\n wave_sizze_82933 = LOCKSTEP_WIDTH;\n group_id_73184 = get_group_id(0);\n \n int32_t gtid_73171;\n int32_t gtid_73172;\n int32_t gtid_73173;\n \n gtid_73171 = squot32(global_tid_73182, K_68510 * triD_68516);\n gtid_73172 = squot32(global_tid_73182 - squot32(global_tid_73182, K_68510 *\n triD_68516) * (K_68510 *\n triD_68516),\n triD_68516);\n gtid_73173 = global_tid_73182 - squot32(global_tid_73182, K_68510 *\n triD_68516) * (K_68510 *\n triD_6",
"8516) -\n squot32(global_tid_73182 - squot32(global_tid_73182, K_68510 *\n triD_68516) * (K_68510 * triD_68516),\n triD_68516) * triD_68516;\n \n double res_77764;\n \n if ((slt32(gtid_73171, N_68508) && slt32(gtid_73172, K_68510)) &&\n slt32(gtid_73173, triD_68516)) {\n double x_77767 = 0.0;\n \n for (int32_t chunk_offset_77766 = 0; chunk_offset_77766 < D_68526;\n chunk_offset_77766++) {\n double x_77774;\n double res_77777;\n \n x_77774 = *(__global double *) &mem_81301[(chunk_offset_77766 *\n (triD_68516 * K_68510 *\n N_68508) + gtid_73171 *\n (triD_68516 * K_68510) +\n gtid_73172 * triD_68516 +\n gtid_73173) * 8];\n res_77777 = x_77767 + x_77774;\n \n double x_tmp_82935 = res_77777;\n \n x_77767 = x_tmp_82935;\n }\n res_77764 = x_77767;\n }\n if ((slt32(gtid_73171, N_68508) && slt32(gtid_73172, K_68510)) &&\n slt32(gtid_73173, triD_68516)) {\n *(__global double *) &mem_81306[(gtid_73171 * (triD_68516 * K_68510) +\n gtid_73172 * triD_68516 + gtid_73173) *\n 8] = res_77764;\n }\n}\n__kernel void map_73315(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, int32_t num_groups_77735,\n int32_t virt_groups_77742, __global\n unsigned char *mem_81283, __global\n unsigned char *mem_81289)\n{\n const int32_t group_sizze_77725 = rev_gmm_objectivezigroup_sizze_73295;\n const int block_dim0 = 0;\n const int ",
"block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73315;\n int32_t local_tid_73316;\n int32_t group_sizze_82927;\n int32_t wave_sizze_82926;\n int32_t group_id_73317;\n \n global_tid_73315 = get_global_id(0);\n local_tid_73316 = get_local_id(0);\n group_sizze_82927 = get_local_size(0);\n wave_sizze_82926 = LOCKSTEP_WIDTH;\n group_id_73317 = get_group_id(0);\n \n int32_t gtid_73291;\n int32_t gtid_73292;\n int32_t phys_group_id_82928;\n \n phys_group_id_82928 = get_group_id(0);\n for (int32_t i_82929 = 0; i_82929 < squot32(virt_groups_77742 -\n phys_group_id_82928 +\n num_groups_77735 - 1,\n num_groups_77735); i_82929++) {\n int32_t virt_group_id_82930 = phys_group_id_82928 + i_82929 *\n num_groups_77735;\n \n gtid_73291 = squot32(virt_group_id_82930 * group_sizze_77725 +\n local_tid_73316, K_68510);\n gtid_73292 = virt_group_id_82930 * group_sizze_77725 + local_tid_73316 -\n squot32(virt_group_id_82930 * group_sizze_77725 + local_tid_73316,\n K_68510) * K_68510;\n if (slt32(gtid_73291, N_68508) && slt32(gtid_73292, K_68510)) { }\n if (slt32(gtid_73291, N_68508) && slt32(gtid_73292, K_68510)) {\n for (int32_t i_82931 = 0; i_82931 < triD_68516; i_82931++) {\n for (int32_t i_82932 = 0; i_82932 < D_68526; i_82932++) {\n *(__global double *) &mem_81289[(gtid_73291 * K_68510 +\n gtid_73292 + (i_82931 *\n (K_68510 *\n N_68508 *\n D_68526) +\n ",
" i_82932 *\n (K_68510 *\n N_68508))) *\n 8] = *(__global\n double *) &mem_81283[(gtid_73291 *\n K_68510 +\n gtid_73292 +\n (i_82932 *\n (K_68510 *\n N_68508 *\n triD_68516) +\n i_82931 *\n (K_68510 *\n N_68508))) *\n 8];\n }\n }\n }\n }\n}\n__kernel void map_73392(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, int32_t num_groups_77161,\n int32_t virt_groups_77168, __global\n unsigned char *mem_80868, __global\n unsigned char *mem_80889, __global\n unsigned char *res_r_r_mem_80930, __global\n unsigned char *mem_80954, __global\n unsigned char *mem_80958, __global\n unsigned char *mem_80962, __global\n unsigned char *mem_80965, __global\n unsigned char *mem_80968, __global\n",
" unsigned char *mem_80971, __global\n unsigned char *mem_80974, __global\n unsigned char *mem_80977, __global\n unsigned char *mem_80980, __global\n unsigned char *mem_80987, __global\n unsigned char *mem_80990, __global\n unsigned char *mem_80996, __global\n unsigned char *mem_81002, __global\n unsigned char *mem_81007, __global\n unsigned char *mem_81749, __global\n unsigned char *mem_81752, __global\n unsigned char *mem_81759, __global\n unsigned char *mem_81762)\n{\n const int32_t group_sizze_77151 = rev_gmm_objectivezigroup_sizze_73372;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73392;\n int32_t local_tid_73393;\n int32_t group_sizze_82694;\n int32_t wave_sizze_82693;\n int32_t group_id_73394;\n \n global_tid_73392 = get_global_id(0);\n local_tid_73393 = get_local_id(0);\n group_sizze_82694 = get_local_size(0);\n wave_sizze_82693 = LOCKSTEP_WIDTH;\n group_id_73394 = get_group_id(0);\n \n int32_t gtid_73366;\n int32_t gtid_73367;\n int32_t gtid_73368;\n int32_t phys_group_id_82695;\n \n phys_group_id_82695 = get_group_id(0);\n for (int32_t i_82696 = 0; i_82696 < squot32(virt_groups_77168 -\n phys_group_id_82695 +\n num_groups_77161 - 1,\n num_groups_77161); i_82696++) {\n int32_t virt_group_id_82697 = phys_group_id_82695 + i_82696 *\n num_groups_77161;\n \n gtid_73366 = squot32(virt_group_id_82697 * group_sizze_77151 +\n local_tid_73393, K_68510 * D_68526);\n gtid_73367 = squo",
"t32(virt_group_id_82697 * group_sizze_77151 +\n local_tid_73393 - squot32(virt_group_id_82697 *\n group_sizze_77151 +\n local_tid_73393,\n K_68510 * D_68526) *\n (K_68510 * D_68526), D_68526);\n gtid_73368 = virt_group_id_82697 * group_sizze_77151 + local_tid_73393 -\n squot32(virt_group_id_82697 * group_sizze_77151 + local_tid_73393,\n K_68510 * D_68526) * (K_68510 * D_68526) -\n squot32(virt_group_id_82697 * group_sizze_77151 + local_tid_73393 -\n squot32(virt_group_id_82697 * group_sizze_77151 +\n local_tid_73393, K_68510 * D_68526) * (K_68510 *\n D_68526),\n D_68526) * D_68526;\n \n double x_77195;\n double qs_elem_elem_77198;\n double res_77199;\n double res_77217;\n \n if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&\n slt32(gtid_73368, D_68526)) {\n x_77195 = *(__global double *) &res_r_r_mem_80930[(gtid_73366 *\n (D_68526 *\n K_68510) +\n gtid_73367 *\n D_68526 +\n gtid_73368) * 8];\n qs_elem_elem_77198 = *(__global double *) &mem_80868[(gtid_73366 *\n (D_68526 *\n K_68510) +\n gtid_73367 *\n ",
" D_68526 +\n gtid_73368) *\n 8];\n \n double x_77202 = 0.0;\n \n for (int32_t chunk_offset_77201 = 0; chunk_offset_77201 < D_68526;\n chunk_offset_77201++) {\n double x_77211;\n double x_77212;\n double res_77214;\n double res_77216;\n \n x_77211 = *(__global double *) &mem_80954[(chunk_offset_77201 *\n (D_68526 * K_68510 *\n N_68508) +\n gtid_73366 *\n (D_68526 * K_68510) +\n gtid_73367 *\n D_68526 +\n gtid_73368) * 8];\n x_77212 = *(__global double *) &res_r_r_mem_80930[(gtid_73366 *\n (D_68526 *\n K_68510) +\n gtid_73367 *\n D_68526 +\n chunk_offset_77201) *\n 8];\n res_77214 = x_77211 * x_77212;\n res_77216 = x_77202 + res_77214;\n \n double x_tmp_82698 = res_77216;\n \n x_77202 = x_tmp_82698;\n }\n res_77199 = x_77202;\n res_77217 = 0.0 - res_77199;\n",
" for (int32_t i_77224 = 0; i_77224 < D_68526; i_77224++) {\n double x_77225;\n double res_77227;\n bool cond_77228;\n bool cond_77229;\n \n x_77225 = *(__global double *) &mem_80889[(gtid_73366 *\n (D_68526 * K_68510) +\n gtid_73367 *\n D_68526 + i_77224) *\n 8];\n res_77227 = x_77195 * x_77225;\n cond_77228 = slt32(gtid_73368, i_77224);\n cond_77229 = gtid_73368 == i_77224;\n if (cond_77228) {\n for (int32_t i_82701 = 0; i_82701 < D_68526; i_82701++) {\n *(__global double *) &mem_80965[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82701 *\n group_sizze_77151) *\n 8] = 0.0;\n }\n for (int32_t i_82702 = 0; i_82702 < triD_68516; i_82702++) {\n *(__global double *) &mem_80968[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82702 *\n group_sizze_77151) *\n 8] = 0.0;\n }\n ",
" for (int32_t i_82703 = 0; i_82703 < D_68526; i_82703++) {\n *(__global double *) &mem_81762[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82703 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_80965[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82703 *\n group_sizze_77151) *\n 8];\n }\n for (int32_t i_82704 = 0; i_82704 < triD_68516; i_82704++) {\n *(__global double *) &mem_81759[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82704 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_80968[(group_id_733",
"94 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82704 *\n group_sizze_77151) *\n 8];\n }\n } else {\n if (cond_77229) {\n double res_77236;\n double deltaVec_arg_77237;\n \n res_77236 = futrts_exp64(qs_elem_elem_77198);\n deltaVec_arg_77237 = res_77227 * res_77236;\n for (int32_t i_77242 = 0; i_77242 < D_68526;\n i_77242++) {\n bool cond_77244;\n double res_77245;\n \n cond_77244 = i_77242 == gtid_73368;\n if (cond_77244) {\n res_77245 = deltaVec_arg_77237;\n } else {\n res_77245 = 0.0;\n }\n *(__global double *) &mem_80971[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_77242 *\n group_sizze_77151) *\n 8] = res_7",
"7245;\n }\n for (int32_t i_82706 = 0; i_82706 < triD_68516;\n i_82706++) {\n *(__global double *) &mem_80974[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82706 *\n group_sizze_77151) *\n 8] = 0.0;\n }\n for (int32_t i_82707 = 0; i_82707 < D_68526;\n i_82707++) {\n *(__global double *) &mem_81752[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82707 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_80971[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82707 *\n group_sizze_77151) *\n ",
" 8];\n }\n for (int32_t i_82708 = 0; i_82708 < triD_68516;\n i_82708++) {\n *(__global double *) &mem_81749[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82708 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_80974[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82708 *\n group_sizze_77151) *\n 8];\n }\n } else {\n int32_t y_77248;\n int32_t x_77249;\n int32_t res_77250;\n int32_t deltaVec_arg_77251;\n \n y_77248 = gtid_73368 - 1;\n x_77249 = gtid_73368 * y_77248;\n res_77250 = sdiv32(x_77249, 2);\n deltaVec_arg_77251 = i_77224 + res_77250;\n ",
" for (int32_t i_77256 = 0; i_77256 < triD_68516;\n i_77256++) {\n bool cond_77258;\n double res_77259;\n \n cond_77258 = i_77256 == deltaVec_arg_77251;\n if (cond_77258) {\n res_77259 = res_77227;\n } else {\n res_77259 = 0.0;\n }\n *(__global double *) &mem_80977[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_77256 *\n group_sizze_77151) *\n 8] = res_77259;\n }\n for (int32_t i_82710 = 0; i_82710 < D_68526;\n i_82710++) {\n *(__global double *) &mem_80980[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82710 *\n group_sizze_77151) *\n 8] = 0.0;\n }\n for (int32_t i_82711 = 0; i_82711 < D_68526;\n i_82711++) {\n *(__global double *) &mem_81752[(group_id_73394 *\n (",
"group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82711 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_80980[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82711 *\n group_sizze_77151) *\n 8];\n }\n for (int32_t i_82712 = 0; i_82712 < triD_68516;\n i_82712++) {\n *(__global double *) &mem_81749[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82712 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_80977[(group_id_73394 *\n ",
" (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82712 *\n group_sizze_77151) *\n 8];\n }\n }\n for (int32_t i_82713 = 0; i_82713 < D_68526; i_82713++) {\n *(__global double *) &mem_81762[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82713 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_81752[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82713 *\n group_sizze_77151) *\n 8];\n }\n for (int32_t i_82714 = 0; i_82714 < triD_68516; i_82714++) {\n ",
" *(__global double *) &mem_81759[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82714 *\n group_sizze_77151) *\n 8] = *(__global\n double *) &mem_81749[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82714 *\n group_sizze_77151) *\n 8];\n }\n }\n for (int32_t i_82715 = 0; i_82715 < triD_68516; i_82715++) {\n *(__global double *) &mem_80962[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516 * D_68526) +\n local_tid_73393 + i_77224 *\n (group_sizze_77151 *\n triD_68516) + i_82715 *\n group_sizze_77151) * 8] =\n *(__global double *) &mem_81759[(group_id_73394 *\n (group_sizze_77151 *\n ",
" triD_68516) +\n local_tid_73393 +\n i_82715 *\n group_sizze_77151) *\n 8];\n }\n for (int32_t i_82716 = 0; i_82716 < D_68526; i_82716++) {\n *(__global double *) &mem_80958[(group_id_73394 *\n (group_sizze_77151 *\n D_68526 * D_68526) +\n local_tid_73393 + i_77224 *\n (group_sizze_77151 *\n D_68526) + i_82716 *\n group_sizze_77151) * 8] =\n *(__global double *) &mem_81762[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82716 *\n group_sizze_77151) *\n 8];\n }\n }\n for (int32_t i_77268 = 0; i_77268 < D_68526; i_77268++) {\n double res_77270;\n double redout_77271 = 0.0;\n \n for (int32_t i_77272 = 0; i_77272 < D_68526; i_77272++) {\n double x_77273;\n double res_77276;\n \n x_77273 = *(__global double *) &mem_80958[(group_id_73394 *\n (group_sizze_77151 *\n ",
" D_68526 *\n D_68526) +\n local_tid_73393 +\n (i_77272 *\n (group_sizze_77151 *\n D_68526) +\n i_77268 *\n group_sizze_77151)) *\n 8];\n res_77276 = redout_77271 + x_77273;\n \n double redout_tmp_82718 = res_77276;\n \n redout_77271 = redout_tmp_82718;\n }\n res_77270 = redout_77271;\n *(__global double *) &mem_80987[(group_id_73394 *\n (group_sizze_77151 * D_68526) +\n local_tid_73393 + i_77268 *\n group_sizze_77151) * 8] =\n res_77270;\n }\n for (int32_t i_77282 = 0; i_77282 < triD_68516; i_77282++) {\n double res_77284;\n double redout_77285 = 0.0;\n \n for (int32_t i_77286 = 0; i_77286 < D_68526; i_77286++) {\n double x_77287;\n double res_77290;\n \n x_77287 = *(__global double *) &mem_80962[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516 *\n D_68526) +\n ",
" local_tid_73393 +\n (i_77286 *\n (group_sizze_77151 *\n triD_68516) +\n i_77282 *\n group_sizze_77151)) *\n 8];\n res_77290 = redout_77285 + x_77287;\n \n double redout_tmp_82720 = res_77290;\n \n redout_77285 = redout_tmp_82720;\n }\n res_77284 = redout_77285;\n *(__global double *) &mem_80990[(group_id_73394 *\n (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 + i_77282 *\n group_sizze_77151) * 8] =\n res_77284;\n }\n }\n if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&\n slt32(gtid_73368, D_68526)) {\n for (int32_t i_82721 = 0; i_82721 < triD_68516; i_82721++) {\n *(__global double *) &mem_80996[(D_68526 * K_68510 * N_68508 *\n 0 + gtid_73366 * (D_68526 *\n K_68510) +\n gtid_73367 * D_68526 +\n gtid_73368 + i_82721 *\n (D_68526 * K_68510 *\n N_68508)) * 8] = *(__global\n double *) &mem_80990[(group_id_73394 *\n ",
" (group_sizze_77151 *\n triD_68516) +\n local_tid_73393 +\n i_82721 *\n group_sizze_77151) *\n 8];\n }\n }\n if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&\n slt32(gtid_73368, D_68526)) {\n for (int32_t i_82722 = 0; i_82722 < D_68526; i_82722++) {\n *(__global double *) &mem_81002[(D_68526 * K_68510 * N_68508 *\n 0 + gtid_73366 * (D_68526 *\n K_68510) +\n gtid_73367 * D_68526 +\n gtid_73368 + i_82722 *\n (D_68526 * K_68510 *\n N_68508)) * 8] = *(__global\n double *) &mem_80987[(group_id_73394 *\n (group_sizze_77151 *\n D_68526) +\n local_tid_73393 +\n i_82722 *\n group_sizz",
"e_77151) *\n 8];\n }\n }\n if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&\n slt32(gtid_73368, D_68526)) {\n *(__global double *) &mem_81007[(gtid_73366 * (D_68526 * K_68510) +\n gtid_73367 * D_68526 +\n gtid_73368) * 8] = res_77217;\n }\n }\n}\n__kernel void map_73675(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, __global unsigned char *mem_81237,\n __global unsigned char *mem_81243)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73675;\n int32_t local_tid_73676;\n int32_t group_sizze_82880;\n int32_t wave_sizze_82879;\n int32_t group_id_73677;\n \n global_tid_73675 = get_global_id(0);\n local_tid_73676 = get_local_id(0);\n group_sizze_82880 = get_local_size(0);\n wave_sizze_82879 = LOCKSTEP_WIDTH;\n group_id_73677 = get_group_id(0);\n \n int32_t gtid_73662;\n int32_t gtid_73663;\n int32_t gtid_73664;\n int32_t gtid_73665;\n \n gtid_73662 = squot32(global_tid_73675, K_68510 * D_68526 * triD_68516);\n gtid_73663 = squot32(global_tid_73675 - squot32(global_tid_73675, K_68510 *\n D_68526 * triD_68516) *\n (K_68510 * D_68526 * triD_68516), D_68526 *\n triD_68516);\n gtid_73664 = squot32(global_tid_73675 - squot32(global_tid_73675, K_68510 *\n D_68526 * triD_68516) *\n (K_68510 * D_68526 * triD_68516) -\n squot32(global_tid_73675 - squot32(global_tid_73675,\n K_68510 * D_68526 *\n ",
" triD_68516) *\n (K_68510 * D_68526 * triD_68516), D_68526 *\n triD_68516) * (D_68526 * triD_68516),\n triD_68516);\n gtid_73665 = global_tid_73675 - squot32(global_tid_73675, K_68510 *\n D_68526 * triD_68516) * (K_68510 *\n D_68526 *\n triD_68516) -\n squot32(global_tid_73675 - squot32(global_tid_73675, K_68510 * D_68526 *\n triD_68516) * (K_68510 * D_68526 *\n triD_68516), D_68526 *\n triD_68516) * (D_68526 * triD_68516) -\n squot32(global_tid_73675 - squot32(global_tid_73675, K_68510 * D_68526 *\n triD_68516) * (K_68510 * D_68526 *\n triD_68516) -\n squot32(global_tid_73675 - squot32(global_tid_73675, K_68510 *\n D_68526 * triD_68516) *\n (K_68510 * D_68526 * triD_68516), D_68526 *\n triD_68516) * (D_68526 * triD_68516), triD_68516) *\n triD_68516;\n \n double res_77665;\n \n if (((slt32(gtid_73662, N_68508) && slt32(gtid_73663, K_68510)) &&\n slt32(gtid_73664, D_68526)) && slt32(gtid_73665, triD_68516)) {\n double x_77668 = 0.0;\n \n for (int32_t chunk_offset_77667 = 0; chunk_offset_77667 < D_68526;\n chunk_offset_77667++) {\n double x_77675;\n double res_77678;\n \n x_77675 = *(__global double *) &mem_81237[(chunk_offset_77667 *\n (triD_68516 * D_68526 *\n K_68510 * N",
"_68508) +\n gtid_73662 *\n (triD_68516 * D_68526 *\n K_68510) + gtid_73663 *\n (triD_68516 * D_68526) +\n gtid_73664 * triD_68516 +\n gtid_73665) * 8];\n res_77678 = x_77668 + x_77675;\n \n double x_tmp_82881 = res_77678;\n \n x_77668 = x_tmp_82881;\n }\n res_77665 = x_77668;\n }\n if (((slt32(gtid_73662, N_68508) && slt32(gtid_73663, K_68510)) &&\n slt32(gtid_73664, D_68526)) && slt32(gtid_73665, triD_68516)) {\n *(__global double *) &mem_81243[(gtid_73662 * (triD_68516 * D_68526 *\n K_68510) + gtid_73663 *\n (triD_68516 * D_68526) + gtid_73664 *\n triD_68516 + gtid_73665) * 8] =\n res_77665;\n }\n}\n__kernel void map_73818(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, int32_t num_groups_77634,\n int32_t virt_groups_77641, __global\n unsigned char *mem_81216, __global\n unsigned char *mem_81223)\n{\n const int32_t group_sizze_77624 = rev_gmm_objectivezigroup_sizze_73798;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73818;\n int32_t local_tid_73819;\n int32_t group_sizze_82873;\n int32_t wave_sizze_82872;\n int32_t group_id_73820;\n \n global_tid_73818 = get_global_id(0);\n local_tid_73819 = get_local_id(0);\n group_sizze_82873 = get_local_size(0);\n wave_sizze_82872 = LOCKSTEP_WIDTH;\n group_id_73820 = get_group_id(0);\n \n int32_t ",
"gtid_73792;\n int32_t gtid_73793;\n int32_t gtid_73794;\n int32_t phys_group_id_82874;\n \n phys_group_id_82874 = get_group_id(0);\n for (int32_t i_82875 = 0; i_82875 < squot32(virt_groups_77641 -\n phys_group_id_82874 +\n num_groups_77634 - 1,\n num_groups_77634); i_82875++) {\n int32_t virt_group_id_82876 = phys_group_id_82874 + i_82875 *\n num_groups_77634;\n \n gtid_73792 = squot32(virt_group_id_82876 * group_sizze_77624 +\n local_tid_73819, K_68510 * D_68526);\n gtid_73793 = squot32(virt_group_id_82876 * group_sizze_77624 +\n local_tid_73819 - squot32(virt_group_id_82876 *\n group_sizze_77624 +\n local_tid_73819,\n K_68510 * D_68526) *\n (K_68510 * D_68526), D_68526);\n gtid_73794 = virt_group_id_82876 * group_sizze_77624 + local_tid_73819 -\n squot32(virt_group_id_82876 * group_sizze_77624 + local_tid_73819,\n K_68510 * D_68526) * (K_68510 * D_68526) -\n squot32(virt_group_id_82876 * group_sizze_77624 + local_tid_73819 -\n squot32(virt_group_id_82876 * group_sizze_77624 +\n local_tid_73819, K_68510 * D_68526) * (K_68510 *\n D_68526),\n D_68526) * D_68526;\n if ((slt32(gtid_73792, N_68508) && slt32(gtid_73793, K_68510)) &&\n slt32(gtid_73794, D_68526)) { }\n if ((slt32(gtid_73792, N_68508) && slt32(gtid_73793, K_68510)) &&\n slt32(gtid_73794, D_68526)) {\n for (int32_t i_82877 = 0; i_82877 < triD_68516; i_82877++) {\n ",
"for (int32_t i_82878 = 0; i_82878 < D_68526; i_82878++) {\n *(__global double *) &mem_81223[(D_68526 * K_68510 *\n N_68508 * D_68526 * 0 +\n D_68526 * K_68510 *\n N_68508 * 0 + gtid_73792 *\n (D_68526 * K_68510) +\n gtid_73793 * D_68526 +\n gtid_73794 + (i_82877 *\n (D_68526 *\n K_68510 *\n N_68508 *\n D_68526) +\n i_82878 *\n (D_68526 *\n K_68510 *\n N_68508))) *\n 8] = *(__global\n double *) &mem_81216[(gtid_73792 *\n (D_68526 *\n K_68510) +\n gtid_73793 *\n D_68526 +\n gtid_73794 +\n (i_82878 *\n ",
" (D_68526 *\n K_68510 *\n N_68508 *\n triD_68516) +\n i_82877 *\n (D_68526 *\n K_68510 *\n N_68508))) *\n 8];\n }\n }\n }\n }\n}\n__kernel void map_73848(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *mem_81168, __global\n unsigned char *mem_81174)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73848;\n int32_t local_tid_73849;\n int32_t group_sizze_82826;\n int32_t wave_sizze_82825;\n int32_t group_id_73850;\n \n global_tid_73848 = get_global_id(0);\n local_tid_73849 = get_local_id(0);\n group_sizze_82826 = get_local_size(0);\n wave_sizze_82825 = LOCKSTEP_WIDTH;\n group_id_73850 = get_group_id(0);\n \n int32_t gtid_73835;\n int32_t gtid_73836;\n int32_t gtid_73837;\n int32_t gtid_73838;\n \n gtid_73835 = squot32(global_tid_73848, K_68510 * D_68526 * D_68526);\n gtid_73836 = squot32(global_tid_73848 - squot32(global_tid_73848, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 * D_68526);\n gtid_73837 = squot32(global_tid_73848 - squot32(global_tid_73848, K_68510 *\n D_68",
"526 * D_68526) *\n (K_68510 * D_68526 * D_68526) -\n squot32(global_tid_73848 - squot32(global_tid_73848,\n K_68510 * D_68526 *\n D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526), D_68526);\n gtid_73838 = global_tid_73848 - squot32(global_tid_73848, K_68510 *\n D_68526 * D_68526) * (K_68510 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73848 - squot32(global_tid_73848, K_68510 * D_68526 *\n D_68526) * (K_68510 * D_68526 *\n D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526) - squot32(global_tid_73848 -\n squot32(global_tid_73848,\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526) -\n squot32(global_tid_73848 -\n squot32(global_tid_73848,\n K_68510 *\n D_68526 *\n D_68526) *\n ",
" (K_68510 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526) *\n (D_68526 * D_68526),\n D_68526) * D_68526;\n \n double res_77566;\n \n if (((slt32(gtid_73835, N_68508) && slt32(gtid_73836, K_68510)) &&\n slt32(gtid_73837, D_68526)) && slt32(gtid_73838, D_68526)) {\n double x_77569 = 0.0;\n \n for (int32_t chunk_offset_77568 = 0; chunk_offset_77568 < D_68526;\n chunk_offset_77568++) {\n double x_77576;\n double res_77579;\n \n x_77576 = *(__global double *) &mem_81168[(chunk_offset_77568 *\n (D_68526 * D_68526 *\n K_68510 * N_68508) +\n gtid_73835 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_73836 * (D_68526 *\n D_68526) +\n gtid_73837 * D_68526 +\n gtid_73838) * 8];\n res_77579 = x_77569 + x_77576;\n \n double x_tmp_82827 = res_77579;\n \n x_77569 = x_tmp_82827;\n }\n res_77566 = x_77569;\n }\n if (((slt32(gtid_73835, N_68508) && slt32(gtid_73836, K_68510)) &&\n slt32(gtid_73837, D_68526)) && slt32(gtid_73838, D_68526)) {\n *(__global ",
"double *) &mem_81174[(gtid_73835 * (D_68526 * D_68526 *\n K_68510) + gtid_73836 *\n (D_68526 * D_68526) + gtid_73837 *\n D_68526 + gtid_73838) * 8] = res_77566;\n }\n}\n__kernel void map_73991(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n int32_t num_groups_77535, int32_t virt_groups_77542,\n __global unsigned char *mem_81147, __global\n unsigned char *mem_81154)\n{\n const int32_t group_sizze_77525 = rev_gmm_objectivezigroup_sizze_73971;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_73991;\n int32_t local_tid_73992;\n int32_t group_sizze_82819;\n int32_t wave_sizze_82818;\n int32_t group_id_73993;\n \n global_tid_73991 = get_global_id(0);\n local_tid_73992 = get_local_id(0);\n group_sizze_82819 = get_local_size(0);\n wave_sizze_82818 = LOCKSTEP_WIDTH;\n group_id_73993 = get_group_id(0);\n \n int32_t gtid_73965;\n int32_t gtid_73966;\n int32_t gtid_73967;\n int32_t phys_group_id_82820;\n \n phys_group_id_82820 = get_group_id(0);\n for (int32_t i_82821 = 0; i_82821 < squot32(virt_groups_77542 -\n phys_group_id_82820 +\n num_groups_77535 - 1,\n num_groups_77535); i_82821++) {\n int32_t virt_group_id_82822 = phys_group_id_82820 + i_82821 *\n num_groups_77535;\n \n gtid_73965 = squot32(virt_group_id_82822 * group_sizze_77525 +\n local_tid_73992, K_68510 * D_68526);\n gtid_73966 = squot32(virt_group_id_82822 * group_sizze_77525 +\n local_tid_73992 - squot32(virt_group_id_82822 *\n group_sizz",
"e_77525 +\n local_tid_73992,\n K_68510 * D_68526) *\n (K_68510 * D_68526), D_68526);\n gtid_73967 = virt_group_id_82822 * group_sizze_77525 + local_tid_73992 -\n squot32(virt_group_id_82822 * group_sizze_77525 + local_tid_73992,\n K_68510 * D_68526) * (K_68510 * D_68526) -\n squot32(virt_group_id_82822 * group_sizze_77525 + local_tid_73992 -\n squot32(virt_group_id_82822 * group_sizze_77525 +\n local_tid_73992, K_68510 * D_68526) * (K_68510 *\n D_68526),\n D_68526) * D_68526;\n if ((slt32(gtid_73965, N_68508) && slt32(gtid_73966, K_68510)) &&\n slt32(gtid_73967, D_68526)) { }\n if ((slt32(gtid_73965, N_68508) && slt32(gtid_73966, K_68510)) &&\n slt32(gtid_73967, D_68526)) {\n for (int32_t i_82823 = 0; i_82823 < D_68526; i_82823++) {\n for (int32_t i_82824 = 0; i_82824 < D_68526; i_82824++) {\n *(__global double *) &mem_81154[(D_68526 * K_68510 *\n N_68508 * 0 + gtid_73965 *\n (D_68526 * K_68510) +\n gtid_73966 * D_68526 +\n gtid_73967 + (i_82823 *\n (D_68526 *\n K_68510 *\n N_68508 *\n D_68526) +\n i_82824 *\n ",
" (D_68526 *\n K_68510 *\n N_68508))) *\n 8] = *(__global\n double *) &mem_81147[(gtid_73965 *\n (D_68526 *\n K_68510) +\n gtid_73966 *\n D_68526 +\n gtid_73967 +\n (i_82824 *\n (D_68526 *\n K_68510 *\n N_68508 *\n D_68526) +\n i_82823 *\n (D_68526 *\n K_68510 *\n N_68508))) *\n 8];\n }\n }\n }\n }\n}\n__kernel void map_74126(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, int32_t num_groups_77470,\n int32_t virt_groups_77477, __global\n u",
"nsigned char *mem_80873, __global\n unsigned char *mem_80889, __global\n unsigned char *mem_81097, __global\n unsigned char *mem_81100, __global\n unsigned char *mem_81103, __global\n unsigned char *mem_81106, __global\n unsigned char *mem_81109, __global\n unsigned char *mem_81112, __global\n unsigned char *mem_81115, __global\n unsigned char *mem_81126, __global\n unsigned char *mem_81133, __global\n unsigned char *mem_81789, __global\n unsigned char *mem_81792, __global\n unsigned char *mem_81799, __global\n unsigned char *mem_81802)\n{\n const int32_t group_sizze_77460 = rev_gmm_objectivezigroup_sizze_74106;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74126;\n int32_t local_tid_74127;\n int32_t group_sizze_82798;\n int32_t wave_sizze_82797;\n int32_t group_id_74128;\n \n global_tid_74126 = get_global_id(0);\n local_tid_74127 = get_local_id(0);\n group_sizze_82798 = get_local_size(0);\n wave_sizze_82797 = LOCKSTEP_WIDTH;\n group_id_74128 = get_group_id(0);\n \n int32_t gtid_74098;\n int32_t gtid_74099;\n int32_t gtid_74100;\n int32_t gtid_74101;\n int32_t phys_group_id_82799;\n \n phys_group_id_82799 = get_group_id(0);\n for (int32_t i_82800 = 0; i_82800 < squot32(virt_groups_77477 -\n phys_group_id_82799 +\n num_groups_77470 - 1,\n num_groups_77470); i_82800++) {\n int32_t virt_group_id_82801 = phys_group_id_82799 + i_82800 *\n num_groups_77470;\n \n gtid_74098 = squot32(virt_group_id_82",
"801 * group_sizze_77460 +\n local_tid_74127, K_68510 * D_68526 * D_68526);\n gtid_74099 = squot32(virt_group_id_82801 * group_sizze_77460 +\n local_tid_74127 - squot32(virt_group_id_82801 *\n group_sizze_77460 +\n local_tid_74127,\n K_68510 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526),\n D_68526 * D_68526);\n gtid_74100 = squot32(virt_group_id_82801 * group_sizze_77460 +\n local_tid_74127 - squot32(virt_group_id_82801 *\n group_sizze_77460 +\n local_tid_74127,\n K_68510 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526) -\n squot32(virt_group_id_82801 * group_sizze_77460 +\n local_tid_74127 -\n squot32(virt_group_id_82801 *\n group_sizze_77460 +\n local_tid_74127, K_68510 *\n D_68526 * D_68526) * (K_68510 *\n D_68526 *\n D_68526),\n D_68526 * D_68526) * (D_68526 * D_68526)",
",\n D_68526);\n gtid_74101 = virt_group_id_82801 * group_sizze_77460 + local_tid_74127 -\n squot32(virt_group_id_82801 * group_sizze_77460 + local_tid_74127,\n K_68510 * D_68526 * D_68526) * (K_68510 * D_68526 *\n D_68526) -\n squot32(virt_group_id_82801 * group_sizze_77460 + local_tid_74127 -\n squot32(virt_group_id_82801 * group_sizze_77460 +\n local_tid_74127, K_68510 * D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 * D_68526) *\n (D_68526 * D_68526) - squot32(virt_group_id_82801 *\n group_sizze_77460 + local_tid_74127 -\n squot32(virt_group_id_82801 *\n group_sizze_77460 +\n local_tid_74127, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526) -\n squot32(virt_group_id_82801 *\n group_sizze_77460 +\n local_tid_74127 -\n squot32(virt_group_id_82801 *\n group_sizze_77460 +\n local_tid_74127,\n K_68510 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526),\n D_68526 * D_68526) *\n ",
" (D_68526 * D_68526), D_68526) *\n D_68526;\n \n double x_77480;\n double qs_elem_elem_77482;\n double x_77483;\n double res_77485;\n bool cond_77486;\n bool cond_77487;\n \n if (((slt32(gtid_74098, N_68508) && slt32(gtid_74099, K_68510)) &&\n slt32(gtid_74100, D_68526)) && slt32(gtid_74101, D_68526)) {\n x_77480 = *(__global double *) &mem_81097[(gtid_74099 * (N_68508 *\n D_68526) +\n gtid_74100 * N_68508 +\n gtid_74098) * 8];\n qs_elem_elem_77482 = *(__global double *) &mem_80873[(gtid_74099 *\n (N_68508 *\n D_68526) +\n gtid_74100 *\n N_68508 +\n gtid_74098) *\n 8];\n x_77483 = *(__global double *) &mem_80889[(gtid_74098 * (D_68526 *\n K_68510) +\n gtid_74099 * D_68526 +\n gtid_74101) * 8];\n res_77485 = x_77480 * x_77483;\n cond_77486 = slt32(gtid_74100, gtid_74101);\n cond_77487 = gtid_74100 == gtid_74101;\n if (cond_77486) {\n for (int32_t i_82802 = 0; i_82802 < D_68526; i_82802++) {\n *(__global double *) &mem_81100[(group_id_74128 *\n (group_sizze_77460 *\n ",
" D_68526) +\n local_tid_74127 + i_82802 *\n group_sizze_77460) * 8] =\n 0.0;\n }\n for (int32_t i_82803 = 0; i_82803 < triD_68516; i_82803++) {\n *(__global double *) &mem_81103[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 + i_82803 *\n group_sizze_77460) * 8] =\n 0.0;\n }\n for (int32_t i_82804 = 0; i_82804 < D_68526; i_82804++) {\n *(__global double *) &mem_81802[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 + i_82804 *\n group_sizze_77460) * 8] =\n *(__global double *) &mem_81100[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82804 *\n group_sizze_77460) *\n 8];\n }\n for (int32_t i_82805 = 0; i_82805 < triD_68516; i_82805++) {\n *(__global double *) &mem_81799[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n ",
" local_tid_74127 + i_82805 *\n group_sizze_77460) * 8] =\n *(__global double *) &mem_81103[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82805 *\n group_sizze_77460) *\n 8];\n }\n } else {\n if (cond_77487) {\n double res_77494;\n double deltaVec_arg_77495;\n \n res_77494 = futrts_exp64(qs_elem_elem_77482);\n deltaVec_arg_77495 = res_77485 * res_77494;\n for (int32_t i_77500 = 0; i_77500 < D_68526; i_77500++) {\n bool cond_77502;\n double res_77503;\n \n cond_77502 = i_77500 == gtid_74100;\n if (cond_77502) {\n res_77503 = deltaVec_arg_77495;\n } else {\n res_77503 = 0.0;\n }\n *(__global double *) &mem_81106[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_77500 *\n group_sizze_77460) *\n 8] = res_77503;\n }\n for (int32_t i_82807 = 0; i_82807 < triD_68516; i_",
"82807++) {\n *(__global double *) &mem_81109[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82807 *\n group_sizze_77460) *\n 8] = 0.0;\n }\n for (int32_t i_82808 = 0; i_82808 < D_68526; i_82808++) {\n *(__global double *) &mem_81792[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82808 *\n group_sizze_77460) *\n 8] = *(__global\n double *) &mem_81106[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82808 *\n group_sizze_77460) *\n 8];\n }\n for (int32_t i_82809 = 0; i_82809 < triD_68516; i_82809++) {\n *(__global double *) &mem_81789[(group_id_",
"74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82809 *\n group_sizze_77460) *\n 8] = *(__global\n double *) &mem_81109[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82809 *\n group_sizze_77460) *\n 8];\n }\n } else {\n int32_t y_77506;\n int32_t x_77507;\n int32_t res_77508;\n int32_t deltaVec_arg_77509;\n \n y_77506 = gtid_74100 - 1;\n x_77507 = gtid_74100 * y_77506;\n res_77508 = sdiv32(x_77507, 2);\n deltaVec_arg_77509 = gtid_74101 + res_77508;\n for (int32_t i_77514 = 0; i_77514 < triD_68516; i_77514++) {\n bool cond_77516;\n double res_77517;\n \n cond_77516 = i_77514 == deltaVec_arg_77509;\n if (cond_77516) {\n res_77517 = res_77485;\n } else {\n ",
" res_77517 = 0.0;\n }\n *(__global double *) &mem_81112[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_77514 *\n group_sizze_77460) *\n 8] = res_77517;\n }\n for (int32_t i_82811 = 0; i_82811 < D_68526; i_82811++) {\n *(__global double *) &mem_81115[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82811 *\n group_sizze_77460) *\n 8] = 0.0;\n }\n for (int32_t i_82812 = 0; i_82812 < D_68526; i_82812++) {\n *(__global double *) &mem_81792[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82812 *\n group_sizze_77460) *\n 8] = *(__global\n double *) &mem_81115[(group_id_74128 *\n (group_sizze_77460 *\n ",
" D_68526) +\n local_tid_74127 +\n i_82812 *\n group_sizze_77460) *\n 8];\n }\n for (int32_t i_82813 = 0; i_82813 < triD_68516; i_82813++) {\n *(__global double *) &mem_81789[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82813 *\n group_sizze_77460) *\n 8] = *(__global\n double *) &mem_81112[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82813 *\n group_sizze_77460) *\n 8];\n }\n }\n for (int32_t i_82814 = 0; i_82814 < D_68526; i_82814++) {\n *(__global double *) &mem_81802[(group_id_74128 *\n ",
" (group_sizze_77460 *\n D_68526) +\n local_tid_74127 + i_82814 *\n group_sizze_77460) * 8] =\n *(__global double *) &mem_81792[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82814 *\n group_sizze_77460) *\n 8];\n }\n for (int32_t i_82815 = 0; i_82815 < triD_68516; i_82815++) {\n *(__global double *) &mem_81799[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 + i_82815 *\n group_sizze_77460) * 8] =\n *(__global double *) &mem_81789[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82815 *\n group_sizze_77460) *\n 8];\n }\n }\n }\n if (((slt32(gtid_74098, N_68508) && slt32(gtid_74099, K_68510)) &&\n slt32(gtid_74100, D_68526)) && slt32(gtid_74101, D_68526)) {\n for (int32_t i_82816 = 0; i_82816 < D_6852",
"6; i_82816++) {\n *(__global double *) &mem_81126[(gtid_74098 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_74099 * (D_68526 *\n D_68526) +\n gtid_74100 * D_68526 +\n gtid_74101 + i_82816 *\n (D_68526 * D_68526 * K_68510 *\n N_68508)) * 8] = *(__global\n double *) &mem_81802[(group_id_74128 *\n (group_sizze_77460 *\n D_68526) +\n local_tid_74127 +\n i_82816 *\n group_sizze_77460) *\n 8];\n }\n }\n if (((slt32(gtid_74098, N_68508) && slt32(gtid_74099, K_68510)) &&\n slt32(gtid_74100, D_68526)) && slt32(gtid_74101, D_68526)) {\n for (int32_t i_82817 = 0; i_82817 < triD_68516; i_82817++) {\n *(__global double *) &mem_81133[(D_68526 * D_68526 * K_68510 *\n N_68508 * 0 + gtid_74098 *\n (D_68526 * D_68526 * K_68510) +\n gtid_74099 * (D_68526 *\n ",
" D_68526) +\n gtid_74100 * D_68526 +\n gtid_74101 + i_82817 *\n (D_68526 * D_68526 * K_68510 *\n N_68508)) * 8] = *(__global\n double *) &mem_81799[(group_id_74128 *\n (group_sizze_77460 *\n triD_68516) +\n local_tid_74127 +\n i_82817 *\n group_sizze_77460) *\n 8];\n }\n }\n }\n}\n__kernel void map_74247(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *mem_81087, __global\n unsigned char *mem_81092)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74247;\n int32_t local_tid_74248;\n int32_t group_sizze_82796;\n int32_t wave_sizze_82795;\n int32_t group_id_74249;\n \n global_tid_74247 = get_global_id(0);\n local_tid_74248 = get_local_id(0);\n group_sizze_82796 = get_local_size(0);\n wave_sizze_82795 = LOCKSTEP_WIDTH;\n group_id_74249 = get_group_id(0);\n \n int32_t gtid_74236;\n int32_t gtid_74237;\n int32_t gtid_74238;\n \n gtid_74236 = squot32(global_tid_74247, K_68510 * D_68526);\n gtid_74237 = squot32(global_tid_74247 - squot32(global_tid_74247, K_68510 *\n ",
" D_68526) * (K_68510 *\n D_68526),\n D_68526);\n gtid_74238 = global_tid_74247 - squot32(global_tid_74247, K_68510 *\n D_68526) * (K_68510 * D_68526) -\n squot32(global_tid_74247 - squot32(global_tid_74247, K_68510 *\n D_68526) * (K_68510 * D_68526),\n D_68526) * D_68526;\n \n double res_77431;\n double res_77432;\n \n if ((slt32(gtid_74236, N_68508) && slt32(gtid_74237, K_68510)) &&\n slt32(gtid_74238, D_68526)) {\n res_77431 = *(__global double *) &mem_81087[(gtid_74236 * (D_68526 *\n K_68510) +\n gtid_74237 * D_68526 +\n gtid_74238) * 8];\n res_77432 = 0.0 - res_77431;\n }\n if ((slt32(gtid_74236, N_68508) && slt32(gtid_74237, K_68510)) &&\n slt32(gtid_74238, D_68526)) {\n *(__global double *) &mem_81092[(gtid_74236 * (D_68526 * K_68510) +\n gtid_74237 * D_68526 + gtid_74238) *\n 8] = res_77432;\n }\n}\n__kernel void map_74648(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n int32_t num_groups_77137, int32_t virt_groups_77144,\n __global unsigned char *mem_80936, __global\n unsigned char *mem_80942)\n{\n const int32_t group_sizze_77127 = rev_gmm_objectivezigroup_sizze_74628;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74648;\n int32_t local_tid_74649;\n int32_t group_sizze_82687;\n int32_t wave_sizze_82686;\n int32_t group_id_74650;\n \n global_tid_74648 = get_global_id(0);\n local_tid_74649 = get_loc",
"al_id(0);\n group_sizze_82687 = get_local_size(0);\n wave_sizze_82686 = LOCKSTEP_WIDTH;\n group_id_74650 = get_group_id(0);\n \n int32_t gtid_74624;\n int32_t gtid_74625;\n int32_t phys_group_id_82688;\n \n phys_group_id_82688 = get_group_id(0);\n for (int32_t i_82689 = 0; i_82689 < squot32(virt_groups_77144 -\n phys_group_id_82688 +\n num_groups_77137 - 1,\n num_groups_77137); i_82689++) {\n int32_t virt_group_id_82690 = phys_group_id_82688 + i_82689 *\n num_groups_77137;\n \n gtid_74624 = squot32(virt_group_id_82690 * group_sizze_77127 +\n local_tid_74649, K_68510);\n gtid_74625 = virt_group_id_82690 * group_sizze_77127 + local_tid_74649 -\n squot32(virt_group_id_82690 * group_sizze_77127 + local_tid_74649,\n K_68510) * K_68510;\n if (slt32(gtid_74624, N_68508) && slt32(gtid_74625, K_68510)) { }\n if (slt32(gtid_74624, N_68508) && slt32(gtid_74625, K_68510)) {\n for (int32_t i_82691 = 0; i_82691 < D_68526; i_82691++) {\n for (int32_t i_82692 = 0; i_82692 < D_68526; i_82692++) {\n *(__global double *) &mem_80942[(gtid_74624 * K_68510 +\n gtid_74625 + (i_82691 *\n (K_68510 *\n N_68508 *\n D_68526) +\n i_82692 *\n (K_68510 *\n N_68508))) *\n 8] = *(__global\n ",
" double *) &mem_80936[(gtid_74624 *\n K_68510 +\n gtid_74625 +\n (i_82692 *\n (K_68510 *\n N_68508 *\n D_68526) +\n i_82691 *\n (K_68510 *\n N_68508))) *\n 8];\n }\n }\n }\n }\n}\n__kernel void map_74677(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *mem_80889, __global\n unsigned char *mem_80897, __global\n unsigned char *mem_80903, __global\n unsigned char *mem_80908)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74677;\n int32_t local_tid_74678;\n int32_t group_sizze_82638;\n int32_t wave_sizze_82637;\n int32_t group_id_74679;\n \n global_tid_74677 = get_global_id(0);\n local_tid_74678 = get_local_id(0);\n group_sizze_82638 = get_local_size(0);\n wave_sizze_82637 = LOCKSTEP_WIDTH;\n group_id_74679 = get_group_id(0);\n \n int32_t gtid_74666;\n int32_t gtid_74667;\n int32_t gtid_74668;\n \n gtid_74666 = squot32(global_tid_74677, K_68510 * D_68526);\n gtid_74667 = squot32(global_tid_74677 - squot32(global_",
"tid_74677, K_68510 *\n D_68526) * (K_68510 *\n D_68526),\n D_68526);\n gtid_74668 = global_tid_74677 - squot32(global_tid_74677, K_68510 *\n D_68526) * (K_68510 * D_68526) -\n squot32(global_tid_74677 - squot32(global_tid_74677, K_68510 *\n D_68526) * (K_68510 * D_68526),\n D_68526) * D_68526;\n \n double rev_sqnorm_arg_77038;\n double res_77040;\n double res_77058;\n double res_77059;\n \n if ((slt32(gtid_74666, N_68508) && slt32(gtid_74667, K_68510)) &&\n slt32(gtid_74668, D_68526)) {\n rev_sqnorm_arg_77038 = *(__global double *) &mem_80897[(gtid_74667 *\n N_68508 +\n gtid_74666) *\n 8];\n \n double x_77043 = 0.0;\n \n for (int32_t chunk_offset_77042 = 0; chunk_offset_77042 < D_68526;\n chunk_offset_77042++) {\n double x_77052;\n double x_77053;\n double res_77055;\n double res_77057;\n \n x_77052 = *(__global double *) &mem_80889[(gtid_74666 * (D_68526 *\n K_68510) +\n gtid_74667 * D_68526 +\n chunk_offset_77042) * 8];\n x_77053 = *(__global double *) &mem_80903[(chunk_offset_77042 *\n (D_68526 * K_68510 *\n N_68508) + gtid_74666 *\n (D_68526 * K_68510) +\n ",
" gtid_74667 * D_68526 +\n gtid_74668) * 8];\n res_77055 = x_77052 * x_77053;\n res_77057 = x_77043 + res_77055;\n \n double x_tmp_82639 = res_77057;\n \n x_77043 = x_tmp_82639;\n }\n res_77040 = x_77043;\n res_77058 = rev_sqnorm_arg_77038 * res_77040;\n res_77059 = res_77058 + res_77058;\n }\n if ((slt32(gtid_74666, N_68508) && slt32(gtid_74667, K_68510)) &&\n slt32(gtid_74668, D_68526)) {\n *(__global double *) &mem_80908[(gtid_74666 * (D_68526 * K_68510) +\n gtid_74667 * D_68526 + gtid_74668) *\n 8] = res_77059;\n }\n}\n__kernel void map_74751(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *mem_80919, __global\n unsigned char *mem_80923, __global\n unsigned char *mem_80928)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74751;\n int32_t local_tid_74752;\n int32_t group_sizze_82685;\n int32_t wave_sizze_82684;\n int32_t group_id_74753;\n \n global_tid_74751 = get_global_id(0);\n local_tid_74752 = get_local_id(0);\n group_sizze_82685 = get_local_size(0);\n wave_sizze_82684 = LOCKSTEP_WIDTH;\n group_id_74753 = get_group_id(0);\n \n int32_t gtid_74740;\n int32_t gtid_74741;\n int32_t gtid_74742;\n \n gtid_74740 = squot32(global_tid_74751, K_68510 * D_68526);\n gtid_74741 = squot32(global_tid_74751 - squot32(global_tid_74751, K_68510 *\n D_68526) * (K_68510 *\n D_68526),\n D_68526);\n gtid_74742 = global_tid_74751 - squot32(global_tid_74751, K_68510 *\n D_6",
"8526) * (K_68510 * D_68526) -\n squot32(global_tid_74751 - squot32(global_tid_74751, K_68510 *\n D_68526) * (K_68510 * D_68526),\n D_68526) * D_68526;\n \n double rev_sqnorm_arg_77120;\n double res_77121;\n double res_77122;\n double res_77123;\n \n if ((slt32(gtid_74740, N_68508) && slt32(gtid_74741, K_68510)) &&\n slt32(gtid_74742, D_68526)) {\n rev_sqnorm_arg_77120 = *(__global double *) &mem_80923[(gtid_74741 *\n N_68508 +\n gtid_74740) *\n 8];\n res_77121 = *(__global double *) &mem_80919[(gtid_74740 * (D_68526 *\n K_68510) +\n gtid_74741 * D_68526 +\n gtid_74742) * 8];\n res_77122 = rev_sqnorm_arg_77120 * res_77121;\n res_77123 = res_77122 + res_77122;\n }\n if ((slt32(gtid_74740, N_68508) && slt32(gtid_74741, K_68510)) &&\n slt32(gtid_74742, D_68526)) {\n *(__global double *) &mem_80928[(gtid_74740 * (D_68526 * K_68510) +\n gtid_74741 * D_68526 + gtid_74742) *\n 8] = res_77123;\n }\n}\n__kernel void map_74848(int32_t N_68508, int32_t K_68510, __global\n unsigned char *mem_80863, __global\n unsigned char *mem_80893)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74848;\n int32_t local_tid_74849;\n int32_t group_sizze_82636;\n int32_t wave_sizze_82635;\n int32_t group_id_74850;\n \n global_tid_74848 = get_global_id(0);\n local_tid_74849 = get_local_id(0);\n group_sizze_82636 = get_local_size(0);\n",
" wave_sizze_82635 = LOCKSTEP_WIDTH;\n group_id_74850 = get_group_id(0);\n \n int32_t gtid_74839;\n int32_t gtid_74840;\n \n gtid_74839 = squot32(global_tid_74848, K_68510);\n gtid_74840 = global_tid_74848 - squot32(global_tid_74848, K_68510) *\n K_68510;\n \n double res_77017;\n double y_77018;\n double rev_sqnorm_arg_77019;\n \n if (slt32(gtid_74839, N_68508) && slt32(gtid_74840, K_68510)) {\n res_77017 = *(__global double *) &mem_80863[(gtid_74839 * K_68510 +\n gtid_74840) * 8];\n y_77018 = 0.0 - res_77017;\n rev_sqnorm_arg_77019 = 0.5 * y_77018;\n }\n if (slt32(gtid_74839, N_68508) && slt32(gtid_74840, K_68510)) {\n *(__global double *) &mem_80893[(gtid_74839 * K_68510 + gtid_74840) *\n 8] = rev_sqnorm_arg_77019;\n }\n}\n__kernel void map_74932(int32_t N_68508, int32_t D_68509, int32_t K_68510,\n int32_t D_68526, __global unsigned char *x_mem_80366,\n __global unsigned char *mem_80884, __global\n unsigned char *mem_80889)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74932;\n int32_t local_tid_74933;\n int32_t group_sizze_82634;\n int32_t wave_sizze_82633;\n int32_t group_id_74934;\n \n global_tid_74932 = get_global_id(0);\n local_tid_74933 = get_local_id(0);\n group_sizze_82634 = get_local_size(0);\n wave_sizze_82633 = LOCKSTEP_WIDTH;\n group_id_74934 = get_group_id(0);\n \n int32_t gtid_74921;\n int32_t gtid_74922;\n int32_t gtid_74923;\n \n gtid_74921 = squot32(global_tid_74932, K_68510 * D_68526);\n gtid_74922 = squot32(global_tid_74932 - squot32(global_tid_74932, K_68510 *\n D_68526) * (K_68510 *\n D_68526),\n ",
"D_68526);\n gtid_74923 = global_tid_74932 - squot32(global_tid_74932, K_68510 *\n D_68526) * (K_68510 * D_68526) -\n squot32(global_tid_74932 - squot32(global_tid_74932, K_68510 *\n D_68526) * (K_68510 * D_68526),\n D_68526) * D_68526;\n \n double x_elem_elem_77005;\n double means_elem_elem_77006;\n double res_77007;\n \n if ((slt32(gtid_74921, N_68508) && slt32(gtid_74922, K_68510)) &&\n slt32(gtid_74923, D_68526)) {\n x_elem_elem_77005 = *(__global double *) &x_mem_80366[(gtid_74921 *\n D_68509 +\n gtid_74923) * 8];\n means_elem_elem_77006 = *(__global double *) &mem_80884[(gtid_74921 *\n (D_68526 *\n K_68510) +\n gtid_74922 *\n D_68526 +\n gtid_74923) *\n 8];\n res_77007 = x_elem_elem_77005 - means_elem_elem_77006;\n }\n if ((slt32(gtid_74921, N_68508) && slt32(gtid_74922, K_68510)) &&\n slt32(gtid_74923, D_68526)) {\n *(__global double *) &mem_80889[(gtid_74921 * (D_68526 * K_68510) +\n gtid_74922 * D_68526 + gtid_74923) *\n 8] = res_77007;\n }\n}\n__kernel void map_74990(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n int32_t D_68526, __global unsigned char *icf_mem_80370,\n __global unsigned char *mem_80873, __global\n unsigned char *mem_80879)\n{\n const int block_d",
"im0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_74990;\n int32_t local_tid_74991;\n int32_t group_sizze_82632;\n int32_t wave_sizze_82631;\n int32_t group_id_74992;\n \n global_tid_74990 = get_global_id(0);\n local_tid_74991 = get_local_id(0);\n group_sizze_82632 = get_local_size(0);\n wave_sizze_82631 = LOCKSTEP_WIDTH;\n group_id_74992 = get_group_id(0);\n \n int32_t gtid_74977;\n int32_t gtid_74978;\n int32_t gtid_74979;\n int32_t gtid_74980;\n \n gtid_74977 = squot32(global_tid_74990, K_68510 * D_68526 * D_68526);\n gtid_74978 = squot32(global_tid_74990 - squot32(global_tid_74990, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 * D_68526);\n gtid_74979 = squot32(global_tid_74990 - squot32(global_tid_74990, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526) -\n squot32(global_tid_74990 - squot32(global_tid_74990,\n K_68510 * D_68526 *\n D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526), D_68526);\n gtid_74980 = global_tid_74990 - squot32(global_tid_74990, K_68510 *\n D_68526 * D_68526) * (K_68510 *\n D_68526 *\n D_68526) -\n squot32(global_tid_74990 - squot32(global_tid_74990, K_68510 * D_68526 *\n D_68526) * (K_68510 * D_68526 *\n D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526) - s",
"quot32(global_tid_74990 -\n squot32(global_tid_74990,\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526) -\n squot32(global_tid_74990 -\n squot32(global_tid_74990,\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526) *\n (D_68526 * D_68526),\n D_68526) * D_68526;\n \n double qs_elem_elem_76977;\n bool cond_76979;\n double res_76980;\n \n if (((slt32(gtid_74977, N_68508) && slt32(gtid_74978, K_68510)) &&\n slt32(gtid_74979, D_68526)) && slt32(gtid_74980, D_68526)) {\n qs_elem_elem_76977 = *(__global double *) &mem_80873[(gtid_74978 *\n (N_68508 *\n D_68526) +\n gtid_74979 *\n ",
" N_68508 +\n gtid_74977) * 8];\n cond_76979 = slt32(gtid_74979, gtid_74980);\n if (cond_76979) {\n res_76980 = 0.0;\n } else {\n bool cond_76981;\n double res_76982;\n \n cond_76981 = gtid_74979 == gtid_74980;\n if (cond_76981) {\n double res_76983;\n \n res_76983 = futrts_exp64(qs_elem_elem_76977);\n res_76982 = res_76983;\n } else {\n int32_t y_76984;\n int32_t x_76985;\n int32_t res_76986;\n int32_t gmm_knossos_tri_arg_76987;\n int32_t y_76988;\n int32_t x_76989;\n int32_t res_76990;\n int32_t x_76991;\n int32_t x_76992;\n int32_t y_76993;\n int32_t i_76994;\n double res_76995;\n \n y_76984 = D_68526 - 1;\n x_76985 = D_68526 * y_76984;\n res_76986 = sdiv32(x_76985, 2);\n gmm_knossos_tri_arg_76987 = D_68526 - gtid_74980;\n y_76988 = gmm_knossos_tri_arg_76987 - 1;\n x_76989 = gmm_knossos_tri_arg_76987 * y_76988;\n res_76990 = sdiv32(x_76989, 2);\n x_76991 = res_76986 - res_76990;\n x_76992 = gtid_74979 - gtid_74980;\n y_76993 = x_76992 - 1;\n i_76994 = x_76991 + y_76993;\n res_76995 = *(__global double *) &icf_mem_80370[(gtid_74978 *\n triD_68516 +\n i_76994) * 8];\n res_76982 = res_76995;\n }\n res_76980 = res_76982;\n }\n }\n if (((slt32(gtid_74977, N_68508) && slt32(gtid_74978, K_68510)) &&\n slt32(gtid_74979, D_68526)) &",
"& slt32(gtid_74980, D_68526)) {\n *(__global double *) &mem_80879[(gtid_74977 * (D_68526 * D_68526 *\n K_68510) + gtid_74978 *\n (D_68526 * D_68526) + gtid_74979 *\n D_68526 + gtid_74980) * 8] = res_76980;\n }\n}\n__kernel void map_75119(int32_t N_68508, int32_t K_68510, int32_t K_68511,\n int32_t K_68513, int32_t D_68526,\n int32_t num_groups_76916, int32_t virt_groups_76923,\n __global unsigned char *mem_80631, __global\n unsigned char *mem_80635, __global\n unsigned char *mem_80649, __global\n unsigned char *mem_80652, __global\n unsigned char *mem_80854, __global\n unsigned char *mem_80859, __global\n unsigned char *mem_80863)\n{\n const int32_t group_sizze_76906 = rev_gmm_objectivezigroup_sizze_75099;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_75119;\n int32_t local_tid_75120;\n int32_t group_sizze_82625;\n int32_t wave_sizze_82624;\n int32_t group_id_75121;\n \n global_tid_75119 = get_global_id(0);\n local_tid_75120 = get_local_id(0);\n group_sizze_82625 = get_local_size(0);\n wave_sizze_82624 = LOCKSTEP_WIDTH;\n group_id_75121 = get_group_id(0);\n \n int32_t gtid_75095;\n int32_t gtid_75096;\n int32_t phys_group_id_82626;\n \n phys_group_id_82626 = get_group_id(0);\n for (int32_t i_82627 = 0; i_82627 < squot32(virt_groups_76923 -\n phys_group_id_82626 +\n num_groups_76916 - 1,\n num_groups_76916); i_82627++) {\n int32_t virt_group_id_82628 = phys_group_id_82626 + i_82627 *\n ",
" num_groups_76916;\n \n gtid_75095 = squot32(virt_group_id_82628 * group_sizze_76906 +\n local_tid_75120, K_68510);\n gtid_75096 = virt_group_id_82628 * group_sizze_76906 + local_tid_75120 -\n squot32(virt_group_id_82628 * group_sizze_76906 + local_tid_75120,\n K_68510) * K_68510;\n \n double res_76927;\n double res_elem_76928;\n double res_76931;\n double res_76932;\n \n if (slt32(gtid_75095, N_68508) && slt32(gtid_75096, K_68510)) {\n res_76927 = *(__global double *) &mem_80652[gtid_75095 * 8];\n res_elem_76928 = *(__global double *) &mem_80649[(gtid_75095 *\n K_68510 +\n gtid_75096) * 8];\n res_76931 = futrts_exp64(res_elem_76928);\n res_76932 = res_76927 * res_76931;\n }\n if (slt32(gtid_75095, N_68508) && slt32(gtid_75096, K_68510)) {\n for (int32_t i_82629 = 0; i_82629 < D_68526; i_82629++) {\n *(__global double *) &mem_80854[(K_68510 * N_68508 * 0 +\n gtid_75095 * K_68510 +\n gtid_75096 + i_82629 *\n (K_68510 * N_68508)) * 8] =\n *(__global double *) &mem_80635[(K_68511 * 0 + gtid_75096 +\n i_82629 * K_68511) * 8];\n }\n }\n if (slt32(gtid_75095, N_68508) && slt32(gtid_75096, K_68510)) {\n for (int32_t i_82630 = 0; i_82630 < D_68526; i_82630++) {\n *(__global double *) &mem_80859[(K_68510 * N_68508 * 0 +\n gtid_75095 * K_68510 +\n gtid_75096 + i_82630 *\n (K_6851",
"0 * N_68508)) * 8] =\n *(__global double *) &mem_80631[(K_68513 * 0 + gtid_75096 +\n i_82630 * K_68513) * 8];\n }\n }\n if (slt32(gtid_75095, N_68508) && slt32(gtid_75096, K_68510)) {\n *(__global double *) &mem_80863[(gtid_75095 * K_68510 +\n gtid_75096) * 8] = res_76932;\n }\n }\n}\n__kernel void map_76183(int32_t N_68508, double d_r_68524,\n int32_t num_groups_76435, int32_t virt_groups_76442,\n __global unsigned char *mem_80645, __global\n unsigned char *mem_80652)\n{\n const int32_t group_sizze_76425 = rev_gmm_objectivezigroup_sizze_76163;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_76183;\n int32_t local_tid_76184;\n int32_t group_sizze_82527;\n int32_t wave_sizze_82526;\n int32_t group_id_76185;\n \n global_tid_76183 = get_global_id(0);\n local_tid_76184 = get_local_id(0);\n group_sizze_82527 = get_local_size(0);\n wave_sizze_82526 = LOCKSTEP_WIDTH;\n group_id_76185 = get_group_id(0);\n \n int32_t gtid_76161;\n int32_t phys_group_id_82528;\n \n phys_group_id_82528 = get_group_id(0);\n for (int32_t i_82529 = 0; i_82529 < squot32(virt_groups_76442 -\n phys_group_id_82528 +\n num_groups_76435 - 1,\n num_groups_76435); i_82529++) {\n int32_t virt_group_id_82530 = phys_group_id_82528 + i_82529 *\n num_groups_76435;\n \n gtid_76161 = virt_group_id_82530 * group_sizze_76425 + local_tid_76184;\n \n double res_76446;\n double x_76447;\n double res_76448;\n \n if (slt32(gtid_76161, N_68508)) {\n res_76446 = *(__global double *) &mem_80645[gtid_7",
"6161 * 8];\n x_76447 = 1.0 / res_76446;\n res_76448 = d_r_68524 * x_76447;\n }\n if (slt32(gtid_76161, N_68508)) {\n *(__global double *) &mem_80652[gtid_76161 * 8] = res_76448;\n }\n }\n}\n__kernel void map_78058(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n int32_t num_groups_78392, int32_t virt_groups_78399,\n __global unsigned char *alphas_mem_80367, __global\n unsigned char *mem_81439, __global\n unsigned char *mem_81442, __global\n unsigned char *mem_81445, __global\n unsigned char *mem_81449)\n{\n const int32_t group_sizze_78382 = rev_gmm_objectivezigroup_sizze_78038;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_78058;\n int32_t local_tid_78059;\n int32_t group_sizze_83067;\n int32_t wave_sizze_83066;\n int32_t group_id_78060;\n \n global_tid_78058 = get_global_id(0);\n local_tid_78059 = get_local_id(0);\n group_sizze_83067 = get_local_size(0);\n wave_sizze_83066 = LOCKSTEP_WIDTH;\n group_id_78060 = get_group_id(0);\n \n int32_t gtid_78036;\n int32_t phys_group_id_83068;\n \n phys_group_id_83068 = get_group_id(0);\n for (int32_t i_83069 = 0; i_83069 < squot32(virt_groups_78399 -\n phys_group_id_83068 +\n num_groups_78392 - 1,\n num_groups_78392); i_83069++) {\n int32_t virt_group_id_83070 = phys_group_id_83068 + i_83069 *\n num_groups_78392;\n \n gtid_78036 = virt_group_id_83070 * group_sizze_78382 + local_tid_78059;\n \n double alphas_elem_78403;\n double res_78417;\n \n if (slt32(gtid_78036, K_68510)) {\n alphas_elem_78403 = *(__global\n ",
" double *) &alphas_mem_80367[gtid_78036 * 8];\n for (int32_t i_78407 = 0; i_78407 < D_68526; i_78407++) {\n double res_78409;\n double redout_78410 = 0.0;\n \n for (int32_t i_78411 = 0; i_78411 < N_68508; i_78411++) {\n double x_78412;\n double res_78415;\n \n x_78412 = *(__global double *) &mem_81439[(i_78411 *\n (K_68510 *\n D_68526) +\n i_78407 *\n K_68510 +\n gtid_78036) * 8];\n res_78415 = redout_78410 + x_78412;\n \n double redout_tmp_83072 = res_78415;\n \n redout_78410 = redout_tmp_83072;\n }\n res_78409 = redout_78410;\n *(__global double *) &mem_81442[(group_id_78060 *\n (group_sizze_78382 * D_68526) +\n local_tid_78059 + i_78407 *\n group_sizze_78382) * 8] =\n res_78409;\n }\n res_78417 = futrts_exp64(alphas_elem_78403);\n }\n if (slt32(gtid_78036, K_68510)) {\n *(__global double *) &mem_81445[gtid_78036 * 8] = res_78417;\n }\n if (slt32(gtid_78036, K_68510)) {\n for (int32_t i_83073 = 0; i_83073 < D_68526; i_83073++) {\n *(__global double *) &mem_81449[(gtid_78036 + i_83073 *\n K_68510) * 8] = *(__global\n double *) &mem_81442[(group_id_78060 *\n ",
" (group_sizze_78382 *\n D_68526) +\n local_tid_78059 +\n i_83073 *\n group_sizze_78382) *\n 8];\n }\n }\n }\n}\n__kernel void map_78148(int32_t K_68510, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *mem_81493)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_78148;\n int32_t local_tid_78149;\n int32_t group_sizze_83129;\n int32_t wave_sizze_83128;\n int32_t group_id_78150;\n \n global_tid_78148 = get_global_id(0);\n local_tid_78149 = get_local_id(0);\n group_sizze_83129 = get_local_size(0);\n wave_sizze_83128 = LOCKSTEP_WIDTH;\n group_id_78150 = get_group_id(0);\n \n int32_t gtid_78141;\n \n gtid_78141 = global_tid_78148;\n \n double alphas_elem_78516;\n double res_78517;\n \n if (slt32(gtid_78141, K_68510)) {\n alphas_elem_78516 = *(__global double *) &alphas_mem_80367[gtid_78141 *\n 8];\n res_78517 = futrts_exp64(alphas_elem_78516);\n }\n if (slt32(gtid_78141, K_68510)) {\n *(__global double *) &mem_81493[gtid_78141 * 8] = res_78517;\n }\n}\n__kernel void map_78171(int32_t N_68508, int32_t K_68510, int32_t D_68526,\n __global unsigned char *res_mem_81412, __global\n unsigned char *mem_81468)\n{\n const int block_dim0 = 0;\n const int blo",
"ck_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_78171;\n int32_t local_tid_78172;\n int32_t group_sizze_83082;\n int32_t wave_sizze_83081;\n int32_t group_id_78173;\n \n global_tid_78171 = get_global_id(0);\n local_tid_78172 = get_local_id(0);\n group_sizze_83082 = get_local_size(0);\n wave_sizze_83081 = LOCKSTEP_WIDTH;\n group_id_78173 = get_group_id(0);\n \n int32_t gtid_78162;\n int32_t gtid_78163;\n \n gtid_78162 = squot32(global_tid_78171, D_68526);\n gtid_78163 = global_tid_78171 - squot32(global_tid_78171, D_68526) *\n D_68526;\n \n double res_78459;\n \n if (slt32(gtid_78162, K_68510) && slt32(gtid_78163, D_68526)) {\n double x_78462 = 0.0;\n \n for (int32_t chunk_offset_78461 = 0; chunk_offset_78461 < N_68508;\n chunk_offset_78461++) {\n double x_78469;\n double res_78472;\n \n x_78469 = *(__global double *) &res_mem_81412[(chunk_offset_78461 *\n (D_68526 * K_68510) +\n gtid_78162 *\n D_68526 +\n gtid_78163) * 8];\n res_78472 = x_78462 + x_78469;\n \n double x_tmp_83083 = res_78472;\n \n x_78462 = x_tmp_83083;\n }\n res_78459 = x_78462;\n }\n if (slt32(gtid_78162, K_68510) && slt32(gtid_78163, D_68526)) {\n *(__global double *) &mem_81468[(gtid_78162 * D_68526 + gtid_78163) *\n 8] = res_78459;\n }\n}\n__kernel void map_78623(int32_t N_68508, int32_t K_68510, int32_t K_68513,\n int32_t K_68515, int32_t triD_68516, int32_t D_68526,\n double res_68862, double t1389_68865, double res_68867,\n int32_t num_groups_78700, int32_t virt_gro",
"ups_78707,\n __global unsigned char *alphas_mem_80367, __global\n unsigned char *res_mem_81411, __global\n unsigned char *mem_81505, __global\n unsigned char *mem_81510, __global\n unsigned char *mem_81514, __global\n unsigned char *mem_81519, __global\n unsigned char *mem_81522, __global\n unsigned char *mem_81525, __global\n unsigned char *mem_81529, __global\n unsigned char *mem_81533, __global\n unsigned char *mem_81536)\n{\n const int32_t group_sizze_78690 = rev_gmm_objectivezigroup_sizze_78603;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_78623;\n int32_t local_tid_78624;\n int32_t group_sizze_83153;\n int32_t wave_sizze_83152;\n int32_t group_id_78625;\n \n global_tid_78623 = get_global_id(0);\n local_tid_78624 = get_local_id(0);\n group_sizze_83153 = get_local_size(0);\n wave_sizze_83152 = LOCKSTEP_WIDTH;\n group_id_78625 = get_group_id(0);\n \n int32_t gtid_78601;\n int32_t phys_group_id_83154;\n \n phys_group_id_83154 = get_group_id(0);\n for (int32_t i_83155 = 0; i_83155 < squot32(virt_groups_78707 -\n phys_group_id_83154 +\n num_groups_78700 - 1,\n num_groups_78700); i_83155++) {\n int32_t virt_group_id_83156 = phys_group_id_83154 + i_83155 *\n num_groups_78700;\n \n gtid_78601 = virt_group_id_83156 * group_sizze_78690 + local_tid_78624;\n \n double alphas_elem_78714;\n double res_78717;\n double res_78731;\n double res_78732;\n double res_78733;\n \n if (slt32(gtid_78601, K_68510)) {\n alphas_",
"elem_78714 = *(__global\n double *) &alphas_mem_80367[gtid_78601 * 8];\n \n double x_78720 = 0.0;\n \n for (int32_t chunk_offset_78719 = 0; chunk_offset_78719 < N_68508;\n chunk_offset_78719++) {\n double x_78727;\n double res_78730;\n \n x_78727 = *(__global\n double *) &res_mem_81411[(chunk_offset_78719 *\n K_68510 + gtid_78601) *\n 8];\n res_78730 = x_78720 + x_78727;\n \n double x_tmp_83157 = res_78730;\n \n x_78720 = x_tmp_83157;\n }\n res_78717 = x_78720;\n res_78731 = futrts_exp64(alphas_elem_78714);\n res_78732 = res_68862 * res_78731;\n res_78733 = res_78717 + res_78732;\n for (int32_t i_78738 = 0; i_78738 < D_68526; i_78738++) {\n double qs_elem_elem_78740;\n double res_78742;\n double res_78749;\n double res_78750;\n double res_78751;\n double res_78753;\n double res_78754;\n double res_78755;\n \n qs_elem_elem_78740 = *(__global double *) &mem_81505[(i_78738 *\n K_68513 +\n gtid_78601) *\n 8];\n \n double redout_78743 = 0.0;\n \n for (int32_t i_78744 = 0; i_78744 < N_68508; i_78744++) {\n double x_78745;\n double res_78748;\n \n x_78745 = *(__global double *) &mem_81510[(i_78744 *\n ",
" (K_68510 *\n D_68526) +\n i_78738 *\n K_68510 +\n gtid_78601) * 8];\n res_78748 = redout_78743 + x_78745;\n \n double redout_tmp_83159 = res_78748;\n \n redout_78743 = redout_tmp_83159;\n }\n res_78742 = redout_78743;\n res_78749 = futrts_exp64(qs_elem_elem_78740);\n res_78750 = t1389_68865 * res_78749;\n res_78751 = res_78750 + res_78750;\n res_78753 = res_78749 * res_78751;\n res_78754 = res_68867 + res_78753;\n res_78755 = res_78742 + res_78754;\n *(__global double *) &mem_81522[(group_id_78625 *\n (group_sizze_78690 * D_68526) +\n local_tid_78624 + i_78738 *\n group_sizze_78690) * 8] =\n res_78755;\n }\n for (int32_t i_78760 = 0; i_78760 < triD_68516; i_78760++) {\n double icf_elem_elem_78762;\n double res_78763;\n double res_78770;\n double res_78771;\n double res_78772;\n \n icf_elem_elem_78762 = *(__global double *) &mem_81514[(i_78760 *\n K_68515 +\n gtid_78601) *\n 8];\n \n double redout_78764 = 0.0;\n \n for (int32_t i_78765 = 0; ",
"i_78765 < N_68508; i_78765++) {\n double x_78766;\n double res_78769;\n \n x_78766 = *(__global double *) &mem_81519[(i_78765 *\n (K_68510 *\n triD_68516) +\n i_78760 *\n K_68510 +\n gtid_78601) * 8];\n res_78769 = redout_78764 + x_78766;\n \n double redout_tmp_83161 = res_78769;\n \n redout_78764 = redout_tmp_83161;\n }\n res_78763 = redout_78764;\n res_78770 = t1389_68865 * icf_elem_elem_78762;\n res_78771 = res_78770 + res_78770;\n res_78772 = res_78763 + res_78771;\n *(__global double *) &mem_81525[(group_id_78625 *\n (group_sizze_78690 *\n triD_68516) +\n local_tid_78624 + i_78760 *\n group_sizze_78690) * 8] =\n res_78772;\n }\n }\n if (slt32(gtid_78601, K_68510)) {\n for (int32_t i_83162 = 0; i_83162 < triD_68516; i_83162++) {\n *(__global double *) &mem_81529[(gtid_78601 + i_83162 *\n K_68510) * 8] = *(__global\n double *) &mem_81525[(group_id_78625 *\n (group_sizze_78690 *\n triD_68516) ",
"+\n local_tid_78624 +\n i_83162 *\n group_sizze_78690) *\n 8];\n }\n }\n if (slt32(gtid_78601, K_68510)) {\n for (int32_t i_83163 = 0; i_83163 < D_68526; i_83163++) {\n *(__global double *) &mem_81533[(gtid_78601 + i_83163 *\n K_68510) * 8] = *(__global\n double *) &mem_81522[(group_id_78625 *\n (group_sizze_78690 *\n D_68526) +\n local_tid_78624 +\n i_83163 *\n group_sizze_78690) *\n 8];\n }\n }\n if (slt32(gtid_78601, K_68510)) {\n *(__global double *) &mem_81536[gtid_78601 * 8] = res_78733;\n }\n }\n}\n__kernel void map_78863(int32_t N_68508, int32_t K_68510, int32_t triD_68516,\n double t1389_68865, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *res_mem_81414, __global\n unsigned char *mem_81614)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_78",
"863;\n int32_t local_tid_78864;\n int32_t group_sizze_83273;\n int32_t wave_sizze_83272;\n int32_t group_id_78865;\n \n global_tid_78863 = get_global_id(0);\n local_tid_78864 = get_local_id(0);\n group_sizze_83273 = get_local_size(0);\n wave_sizze_83272 = LOCKSTEP_WIDTH;\n group_id_78865 = get_group_id(0);\n \n int32_t gtid_78854;\n int32_t gtid_78855;\n \n gtid_78854 = squot32(global_tid_78863, triD_68516);\n gtid_78855 = global_tid_78863 - squot32(global_tid_78863, triD_68516) *\n triD_68516;\n \n double icf_elem_elem_79406;\n double res_79407;\n double res_79421;\n double res_79422;\n double res_79423;\n \n if (slt32(gtid_78854, K_68510) && slt32(gtid_78855, triD_68516)) {\n icf_elem_elem_79406 = *(__global double *) &icf_mem_80370[(gtid_78854 *\n triD_68516 +\n gtid_78855) *\n 8];\n \n double x_79410 = 0.0;\n \n for (int32_t chunk_offset_79409 = 0; chunk_offset_79409 < N_68508;\n chunk_offset_79409++) {\n double x_79417;\n double res_79420;\n \n x_79417 = *(__global double *) &res_mem_81414[(chunk_offset_79409 *\n (triD_68516 *\n K_68510) +\n gtid_78854 *\n triD_68516 +\n gtid_78855) * 8];\n res_79420 = x_79410 + x_79417;\n \n double x_tmp_83274 = res_79420;\n \n x_79410 = x_tmp_83274;\n }\n res_79407 = x_79410;\n res_79421 = t1389_68865 * icf_elem_elem_79406;\n res_79422 = res_79421 + ",
"res_79421;\n res_79423 = res_79407 + res_79422;\n }\n if (slt32(gtid_78854, K_68510) && slt32(gtid_78855, triD_68516)) {\n *(__global double *) &mem_81614[(gtid_78854 * triD_68516 + gtid_78855) *\n 8] = res_79423;\n }\n}\n__kernel void map_78926(int32_t K_68510, int32_t triD_68516, double t1389_68865,\n __global unsigned char *icf_mem_80370, __global\n unsigned char *mem_81638, __global\n unsigned char *mem_81642)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_78926;\n int32_t local_tid_78927;\n int32_t group_sizze_83320;\n int32_t wave_sizze_83319;\n int32_t group_id_78928;\n \n global_tid_78926 = get_global_id(0);\n local_tid_78927 = get_local_id(0);\n group_sizze_83320 = get_local_size(0);\n wave_sizze_83319 = LOCKSTEP_WIDTH;\n group_id_78928 = get_group_id(0);\n \n int32_t gtid_78917;\n int32_t gtid_78918;\n \n gtid_78917 = squot32(global_tid_78926, triD_68516);\n gtid_78918 = global_tid_78926 - squot32(global_tid_78926, triD_68516) *\n triD_68516;\n \n double icf_elem_elem_79471;\n double res_79472;\n double res_79473;\n double res_79474;\n double res_79475;\n \n if (slt32(gtid_78917, K_68510) && slt32(gtid_78918, triD_68516)) {\n icf_elem_elem_79471 = *(__global double *) &icf_mem_80370[(gtid_78917 *\n triD_68516 +\n gtid_78918) *\n 8];\n res_79472 = *(__global double *) &mem_81638[(gtid_78917 * triD_68516 +\n gtid_78918) * 8];\n res_79473 = t1389_68865 * icf_elem_elem_79471;\n res_79474 = res_79473 + res_79473;\n res_79475 = res_79472 + res_79474;\n ",
"}\n if (slt32(gtid_78917, K_68510) && slt32(gtid_78918, triD_68516)) {\n *(__global double *) &mem_81642[(gtid_78917 * triD_68516 + gtid_78918) *\n 8] = res_79475;\n }\n}\n__kernel void map_79020(int32_t N_68508, int32_t K_68510, int32_t D_68514,\n int32_t D_68526, double t1389_68865, double res_68867,\n __global unsigned char *qs_mem_80369, __global\n unsigned char *res_mem_81413, __global\n unsigned char *mem_81580)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_79020;\n int32_t local_tid_79021;\n int32_t group_sizze_83224;\n int32_t wave_sizze_83223;\n int32_t group_id_79022;\n \n global_tid_79020 = get_global_id(0);\n local_tid_79021 = get_local_id(0);\n group_sizze_83224 = get_local_size(0);\n wave_sizze_83223 = LOCKSTEP_WIDTH;\n group_id_79022 = get_group_id(0);\n \n int32_t gtid_79011;\n int32_t gtid_79012;\n \n gtid_79011 = squot32(global_tid_79020, D_68526);\n gtid_79012 = global_tid_79020 - squot32(global_tid_79020, D_68526) *\n D_68526;\n \n double qs_elem_elem_79304;\n double res_79306;\n double res_79320;\n double res_79321;\n double res_79322;\n double res_79324;\n double res_79325;\n double res_79326;\n \n if (slt32(gtid_79011, K_68510) && slt32(gtid_79012, D_68526)) {\n qs_elem_elem_79304 = *(__global double *) &qs_mem_80369[(gtid_79011 *\n D_68514 +\n gtid_79012) *\n 8];\n \n double x_79309 = 0.0;\n \n for (int32_t chunk_offset_79308 = 0; chunk_offset_79308 < N_68508;\n chunk_offset_79308++) {\n double x_79316;\n double res_79319;\n \n ",
" x_79316 = *(__global double *) &res_mem_81413[(chunk_offset_79308 *\n (D_68526 * K_68510) +\n gtid_79011 *\n D_68526 +\n gtid_79012) * 8];\n res_79319 = x_79309 + x_79316;\n \n double x_tmp_83225 = res_79319;\n \n x_79309 = x_tmp_83225;\n }\n res_79306 = x_79309;\n res_79320 = futrts_exp64(qs_elem_elem_79304);\n res_79321 = t1389_68865 * res_79320;\n res_79322 = res_79321 + res_79321;\n res_79324 = res_79320 * res_79322;\n res_79325 = res_68867 + res_79324;\n res_79326 = res_79306 + res_79325;\n }\n if (slt32(gtid_79011, K_68510) && slt32(gtid_79012, D_68526)) {\n *(__global double *) &mem_81580[(gtid_79011 * D_68526 + gtid_79012) *\n 8] = res_79326;\n }\n}\n__kernel void map_79093(int32_t K_68510, int32_t D_68514, int32_t D_68526,\n double t1389_68865, double res_68867, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *mem_81604, __global\n unsigned char *mem_81608)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_79093;\n int32_t local_tid_79094;\n int32_t group_sizze_83271;\n int32_t wave_sizze_83270;\n int32_t group_id_79095;\n \n global_tid_79093 = get_global_id(0);\n local_tid_79094 = get_local_id(0);\n group_sizze_83271 = get_local_size(0);\n wave_sizze_83270 = LOCKSTEP_WIDTH;\n group_id_79095 = get_group_id(0);\n \n int32_t gtid_79084;\n int32_t gtid_79085;\n \n gtid_79084 = squot32(global_tid_79093, D_68526);\n gtid_79085 = global_tid_79093 - squot32(global_tid_79093, D_68526) *\n ",
" D_68526;\n \n double qs_elem_elem_79379;\n double res_79381;\n double res_79382;\n double res_79383;\n double res_79384;\n double res_79386;\n double res_79387;\n double res_79388;\n \n if (slt32(gtid_79084, K_68510) && slt32(gtid_79085, D_68526)) {\n qs_elem_elem_79379 = *(__global double *) &qs_mem_80369[(gtid_79084 *\n D_68514 +\n gtid_79085) *\n 8];\n res_79381 = *(__global double *) &mem_81604[(gtid_79084 * D_68526 +\n gtid_79085) * 8];\n res_79382 = futrts_exp64(qs_elem_elem_79379);\n res_79383 = t1389_68865 * res_79382;\n res_79384 = res_79383 + res_79383;\n res_79386 = res_79382 * res_79384;\n res_79387 = res_68867 + res_79386;\n res_79388 = res_79381 + res_79387;\n }\n if (slt32(gtid_79084, K_68510) && slt32(gtid_79085, D_68526)) {\n *(__global double *) &mem_81608[(gtid_79084 * D_68526 + gtid_79085) *\n 8] = res_79388;\n }\n}\n__kernel void map_79201(int32_t K_68510, double res_68862,\n int32_t num_groups_79271, int32_t virt_groups_79278,\n __global unsigned char *alphas_mem_80367, __global\n unsigned char *mem_81573, __global\n unsigned char *mem_81576)\n{\n const int32_t group_sizze_79261 = rev_gmm_objectivezigroup_sizze_79181;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n int32_t global_tid_79201;\n int32_t local_tid_79202;\n int32_t group_sizze_83219;\n int32_t wave_sizze_83218;\n int32_t group_id_79203;\n \n global_tid_79201 = get_global_id(0);\n local_tid_79202 = get_local_id(0);\n group_sizze_83219 = get_local_size(0);\n wave_sizze_83218 = LOCK",
"STEP_WIDTH;\n group_id_79203 = get_group_id(0);\n \n int32_t gtid_79179;\n int32_t phys_group_id_83220;\n \n phys_group_id_83220 = get_group_id(0);\n for (int32_t i_83221 = 0; i_83221 < squot32(virt_groups_79278 -\n phys_group_id_83220 +\n num_groups_79271 - 1,\n num_groups_79271); i_83221++) {\n int32_t virt_group_id_83222 = phys_group_id_83220 + i_83221 *\n num_groups_79271;\n \n gtid_79179 = virt_group_id_83222 * group_sizze_79261 + local_tid_79202;\n \n double alphas_elem_79281;\n double res_79283;\n double res_79284;\n double res_79285;\n double res_79286;\n \n if (slt32(gtid_79179, K_68510)) {\n alphas_elem_79281 = *(__global\n double *) &alphas_mem_80367[gtid_79179 * 8];\n res_79283 = *(__global double *) &mem_81573[gtid_79179 * 8];\n res_79284 = futrts_exp64(alphas_elem_79281);\n res_79285 = res_68862 * res_79284;\n res_79286 = res_79283 + res_79285;\n }\n if (slt32(gtid_79179, K_68510)) {\n *(__global double *) &mem_81576[gtid_79179 * 8] = res_79286;\n }\n }\n}\n__kernel void map_intra_group_69162(__local volatile\n int64_t *mem_80411_backing_aligned_0,\n int32_t N_68316, int32_t D_68317,\n int32_t K_68318, int32_t K_68319,\n int32_t K_68321, int32_t K_68323,\n int32_t D_68333, __global\n unsigned char *x_mem_80366, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *mem_80397, __global\n unsigned char *",
"mem_80401, __global\n unsigned char *mem_80405, __global\n unsigned char *mem_80408, __global\n unsigned char *mem_80414)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_80411_backing_0 =\n mem_80411_backing_aligned_0;\n int32_t global_tid_69162;\n int32_t local_tid_69163;\n int32_t group_sizze_82046;\n int32_t wave_sizze_82045;\n int32_t group_id_69164;\n \n global_tid_69162 = get_global_id(0);\n local_tid_69163 = get_local_id(0);\n group_sizze_82046 = get_local_size(0);\n wave_sizze_82045 = LOCKSTEP_WIDTH;\n group_id_69164 = get_group_id(0);\n \n int32_t gtid_69122;\n int32_t ltid_69123;\n \n gtid_69122 = squot32(global_tid_69162, K_68318);\n ltid_69123 = global_tid_69162 - squot32(global_tid_69162, K_68318) *\n K_68318;\n \n double x_79555;\n double x_79559;\n double x_70011;\n double x_79563;\n double y_70074;\n double res_70075;\n double res_70076;\n \n if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {\n x_79555 = *(__global double *) &alphas_mem_80367[ltid_69123 * 8];\n \n double x_70000 = 0.0;\n \n for (int32_t chunk_offset_69999 = 0; chunk_offset_69999 < D_68333;\n chunk_offset_69999++) {\n double x_70007;\n double res_70010;\n \n x_70007 = *(__global double *) &mem_80397[(chunk_offset_69999 *\n K_68321 + ltid_69123) *\n 8];\n res_70010 = x_70000 + x_70007;\n \n double x_tmp_82047 = res_70010;\n \n x_70000 = x_tmp_82047;\n }\n x_79559 = x_70000;\n x_70011 = x_79555 + x_79559;\n for (int32_t i_70016 = 0; i_70016 < D_68333; i_",
"70016++) {\n double x_elem_elem_70017;\n double means_elem_elem_70018;\n double res_70019;\n \n x_elem_elem_70017 = *(__global double *) &x_mem_80366[(gtid_69122 *\n D_68317 +\n i_70016) *\n 8];\n means_elem_elem_70018 = *(__global double *) &mem_80401[(i_70016 *\n K_68319 +\n ltid_69123) *\n 8];\n res_70019 = x_elem_elem_70017 - means_elem_elem_70018;\n *(__global double *) &mem_80408[(group_id_69164 * (K_68318 *\n D_68333) +\n local_tid_69163 + i_70016 *\n K_68318) * 8] = res_70019;\n }\n \n double x_70024 = 0.0;\n \n for (int32_t chunk_offset_70023 = 0; chunk_offset_70023 < D_68333;\n chunk_offset_70023++) {\n double qs_elem_elem_70034;\n double res_70036;\n double res_70071;\n double res_70073;\n \n qs_elem_elem_70034 = *(__global\n double *) &mem_80397[(chunk_offset_70023 *\n K_68321 + ltid_69123) *\n 8];\n \n double x_70039 = 0.0;\n \n for (int32_t chunk_offset_70038 = 0; chunk_offset_70038 < D_68333;\n chunk_offset_70038++) {\n double x_70049;\n bool cond_70051;\n double res_70052;\n double res_70068;\n ",
" double res_70070;\n \n x_70049 = *(__global double *) &mem_80408[(group_id_69164 *\n (K_68318 * D_68333) +\n local_tid_69163 +\n chunk_offset_70038 *\n K_68318) * 8];\n cond_70051 = slt32(chunk_offset_70023, chunk_offset_70038);\n if (cond_70051) {\n res_70052 = 0.0;\n } else {\n bool cond_70053;\n double res_70054;\n \n cond_70053 = chunk_offset_70023 == chunk_offset_70038;\n if (cond_70053) {\n double res_70055;\n \n res_70055 = futrts_exp64(qs_elem_elem_70034);\n res_70054 = res_70055;\n } else {\n int32_t y_70056;\n int32_t x_70057;\n int32_t res_70058;\n int32_t gmm_knossos_tri_arg_70059;\n int32_t y_70060;\n int32_t x_70061;\n int32_t res_70062;\n int32_t x_70063;\n int32_t x_70064;\n int32_t y_70065;\n int32_t i_70066;\n double res_70067;\n \n y_70056 = D_68333 - 1;\n x_70057 = D_68333 * y_70056;\n res_70058 = sdiv32(x_70057, 2);\n gmm_knossos_tri_arg_70059 = D_68333 -\n chunk_offset_70038;\n y_70060 = gmm_knossos_tri_arg_70059 - 1;\n x_70061 = gmm_knossos_tri_arg_70059 * y_70060;\n re",
"s_70062 = sdiv32(x_70061, 2);\n x_70063 = res_70058 - res_70062;\n x_70064 = chunk_offset_70023 - chunk_offset_70038;\n y_70065 = x_70064 - 1;\n i_70066 = x_70063 + y_70065;\n res_70067 = *(__global double *) &mem_80405[(i_70066 *\n K_68323 +\n ltid_69123) *\n 8];\n res_70054 = res_70067;\n }\n res_70052 = res_70054;\n }\n res_70068 = x_70049 * res_70052;\n res_70070 = x_70039 + res_70068;\n \n double x_tmp_82050 = res_70070;\n \n x_70039 = x_tmp_82050;\n }\n res_70036 = x_70039;\n res_70071 = res_70036 * res_70036;\n res_70073 = x_70024 + res_70071;\n \n double x_tmp_82049 = res_70073;\n \n x_70024 = x_tmp_82049;\n }\n x_79563 = x_70024;\n y_70074 = 0.5 * x_79563;\n res_70075 = x_70011 - y_70074;\n res_70076 = futrts_exp64(res_70075);\n }\n \n __local char *mem_80411;\n double res_70077;\n \n mem_80411 = (__local char *) mem_80411_backing_0;\n for (int32_t comb_iter_82051 = 0; comb_iter_82051 < 1; comb_iter_82051++) {\n int32_t ctid_69160;\n int32_t flat_comb_id_82052 = comb_iter_82051 * K_68318 +\n local_tid_69163;\n \n ctid_69160 = flat_comb_id_82052;\n if (slt32(ctid_69160, K_68318) && 1) {\n *(__local double *) &mem_80411[ctid_69160 * 8] = res_70076;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82053;\n int32_t skip_waves_82054;\n double x_70078;\n double x_70079;\n \n offset_82053 = 0;\n ",
"// participating threads read initial accumulator\n {\n if (slt32(local_tid_69163, K_68318)) {\n x_70078 = *(__local double *) &mem_80411[(local_tid_69163 +\n offset_82053) * 8];\n }\n }\n offset_82053 = 1;\n while (slt32(offset_82053, wave_sizze_82045)) {\n if (slt32(local_tid_69163 + offset_82053, K_68318) &&\n ((local_tid_69163 - squot32(local_tid_69163, wave_sizze_82045) *\n wave_sizze_82045) & (2 * offset_82053 - 1)) == 0) {\n // read array element\n {\n x_70079 = *(volatile __local\n double *) &mem_80411[(local_tid_69163 +\n offset_82053) * 8];\n }\n // apply reduction operation\n {\n double res_70080;\n \n if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {\n res_70080 = x_70078 + x_70079;\n }\n x_70078 = res_70080;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_80411[local_tid_69163 * 8] =\n x_70078;\n }\n }\n offset_82053 *= 2;\n }\n skip_waves_82054 = 1;\n while (slt32(skip_waves_82054, squot32(K_68318 + wave_sizze_82045 - 1,\n wave_sizze_82045))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82053 = skip_waves_82054 * wave_sizze_82045;\n if (slt32(local_tid_69163 + offset_82053, K_68318) &&\n ((local_tid_69163 - squot32(local_tid_69163, wave_sizze_82045) *\n wave_sizze_82045) == 0 && (squot32(local_tid_69163,\n wave_sizze_82045) & (2 *\n skip_waves_82054 -\n ",
" 1)) ==\n 0)) {\n // read array element\n {\n x_70079 = *(__local double *) &mem_80411[(local_tid_69163 +\n offset_82053) * 8];\n }\n // apply reduction operation\n {\n double res_70080;\n \n if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {\n res_70080 = x_70078 + x_70079;\n }\n x_70078 = res_70080;\n }\n // write result of operation\n {\n *(__local double *) &mem_80411[local_tid_69163 * 8] = x_70078;\n }\n }\n skip_waves_82054 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_70077 = *(__local double *) &mem_80411[0];\n \n double res_70081;\n \n if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {\n res_70081 = futrts_log64(res_70077);\n }\n if (local_tid_69163 == 0) {\n *(__global double *) &mem_80414[group_id_69164 * 8] = res_70081;\n }\n}\n__kernel void map_intra_group_70441(__local volatile\n int64_t *mem_80471_backing_aligned_0,\n __local volatile\n int64_t *mem_80474_backing_aligned_1,\n __local volatile\n int64_t *mem_80477_backing_aligned_2,\n int32_t K_68318, int32_t D_68322,\n int32_t triD_68324, int32_t D_68333,\n double x_68452, double res_68453,\n double y_68475,\n int32_t computed_group_sizze_70945, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *icf_mem_80370, __global\n ",
" unsigned char *mem_80480)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_80471_backing_0 =\n mem_80471_backing_aligned_0;\n __local volatile char *restrict mem_80474_backing_1 =\n mem_80474_backing_aligned_1;\n __local volatile char *restrict mem_80477_backing_2 =\n mem_80477_backing_aligned_2;\n int32_t global_tid_70441;\n int32_t local_tid_70442;\n int32_t group_sizze_82203;\n int32_t wave_sizze_82202;\n int32_t group_id_70443;\n \n global_tid_70441 = get_global_id(0);\n local_tid_70442 = get_local_id(0);\n group_sizze_82203 = get_local_size(0);\n wave_sizze_82202 = LOCKSTEP_WIDTH;\n group_id_70443 = get_group_id(0);\n \n int32_t gtid_70430;\n int32_t ltid_70431;\n \n gtid_70430 = squot32(global_tid_70441, computed_group_sizze_70945);\n ltid_70431 = global_tid_70441 - squot32(global_tid_70441,\n computed_group_sizze_70945) *\n computed_group_sizze_70945;\n \n __local char *mem_80471;\n double res_71035;\n __local char *mem_80474;\n double res_71042;\n \n mem_80471 = (__local char *) mem_80471_backing_0;\n for (int32_t comb_iter_82204 = 0; comb_iter_82204 < squot32(D_68333 +\n computed_group_sizze_70945 -\n 1,\n computed_group_sizze_70945);\n comb_iter_82204++) {\n int32_t ctid_70433;\n int32_t flat_comb_id_82205 = comb_iter_82204 *\n computed_group_sizze_70945 + local_tid_70442;\n \n ctid_70433 = flat_comb_id_82205;\n if (slt32(ctid_70433, D_68333) && 1) {\n double qs_elem_elem_71032;\n double res_71033;\n double res_71034;\n ",
" \n qs_elem_elem_71032 = *(__global\n double *) &qs_mem_80369[(gtid_70430 *\n D_68322 +\n ltid_70431) * 8];\n res_71033 = futrts_exp64(qs_elem_elem_71032);\n res_71034 = res_71033 * res_71033;\n *(__local double *) &mem_80471[ctid_70433 * 8] = res_71034;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82206;\n int32_t skip_waves_82207;\n double x_71036;\n double x_71037;\n \n offset_82206 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70442, D_68333)) {\n x_71036 = *(__local double *) &mem_80471[(local_tid_70442 +\n offset_82206) * 8];\n }\n }\n offset_82206 = 1;\n while (slt32(offset_82206, wave_sizze_82202)) {\n if (slt32(local_tid_70442 + offset_82206, D_68333) &&\n ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *\n wave_sizze_82202) & (2 * offset_82206 - 1)) == 0) {\n // read array element\n {\n x_71037 = *(volatile __local\n double *) &mem_80471[(local_tid_70442 +\n offset_82206) * 8];\n }\n // apply reduction operation\n {\n double res_71038;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n res_71038 = x_71036 + x_71037;\n }\n x_71036 = res_71038;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_80471[local_tid_70442 * 8] =\n x_71036;\n }\n }\n o",
"ffset_82206 *= 2;\n }\n skip_waves_82207 = 1;\n while (slt32(skip_waves_82207, squot32(computed_group_sizze_70945 +\n wave_sizze_82202 - 1,\n wave_sizze_82202))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82206 = skip_waves_82207 * wave_sizze_82202;\n if (slt32(local_tid_70442 + offset_82206, D_68333) &&\n ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *\n wave_sizze_82202) == 0 && (squot32(local_tid_70442,\n wave_sizze_82202) & (2 *\n skip_waves_82207 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_71037 = *(__local double *) &mem_80471[(local_tid_70442 +\n offset_82206) * 8];\n }\n // apply reduction operation\n {\n double res_71038;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n res_71038 = x_71036 + x_71037;\n }\n x_71036 = res_71038;\n }\n // write result of operation\n {\n *(__local double *) &mem_80471[local_tid_70442 * 8] = x_71036;\n }\n }\n skip_waves_82207 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_71035 = *(__local double *) &mem_80471[0];\n mem_80474 = (__local char *) mem_80474_backing_1;\n for (int32_t comb_iter_82208 = 0; comb_iter_82208 < squot32(triD_68324 +\n computed_group_sizze_70945 -\n 1,\n ",
" computed_group_sizze_70945);\n comb_iter_82208++) {\n int32_t ctid_70435;\n int32_t flat_comb_id_82209 = comb_iter_82208 *\n computed_group_sizze_70945 + local_tid_70442;\n \n ctid_70435 = flat_comb_id_82209;\n if (slt32(ctid_70435, triD_68324) && 1) {\n double x_71040;\n double res_71041;\n \n x_71040 = *(__global double *) &icf_mem_80370[(gtid_70430 *\n triD_68324 +\n ltid_70431) * 8];\n res_71041 = x_71040 * x_71040;\n *(__local double *) &mem_80474[ctid_70435 * 8] = res_71041;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82210;\n int32_t skip_waves_82211;\n double x_71043;\n double x_71044;\n \n offset_82210 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70442, triD_68324)) {\n x_71043 = *(__local double *) &mem_80474[(local_tid_70442 +\n offset_82210) * 8];\n }\n }\n offset_82210 = 1;\n while (slt32(offset_82210, wave_sizze_82202)) {\n if (slt32(local_tid_70442 + offset_82210, triD_68324) &&\n ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *\n wave_sizze_82202) & (2 * offset_82210 - 1)) == 0) {\n // read array element\n {\n x_71044 = *(volatile __local\n double *) &mem_80474[(local_tid_70442 +\n offset_82210) * 8];\n }\n // apply reduction operation\n {\n double res_71045;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n ",
" res_71045 = x_71043 + x_71044;\n }\n x_71043 = res_71045;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_80474[local_tid_70442 * 8] =\n x_71043;\n }\n }\n offset_82210 *= 2;\n }\n skip_waves_82211 = 1;\n while (slt32(skip_waves_82211, squot32(computed_group_sizze_70945 +\n wave_sizze_82202 - 1,\n wave_sizze_82202))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82210 = skip_waves_82211 * wave_sizze_82202;\n if (slt32(local_tid_70442 + offset_82210, triD_68324) &&\n ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *\n wave_sizze_82202) == 0 && (squot32(local_tid_70442,\n wave_sizze_82202) & (2 *\n skip_waves_82211 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_71044 = *(__local double *) &mem_80474[(local_tid_70442 +\n offset_82210) * 8];\n }\n // apply reduction operation\n {\n double res_71045;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n res_71045 = x_71043 + x_71044;\n }\n x_71043 = res_71045;\n }\n // write result of operation\n {\n *(__local double *) &mem_80474[local_tid_70442 * 8] = x_71043;\n }\n }\n skip_waves_82211 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_71042 = *(__local double *) &mem_80474",
"[0];\n \n double y_71046;\n double y_71047;\n double x_71048;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n y_71046 = res_71035 + res_71042;\n y_71047 = x_68452 * y_71046;\n x_71048 = 0.5 * y_71047;\n }\n \n __local char *mem_80477;\n double res_71051;\n \n mem_80477 = (__local char *) mem_80477_backing_2;\n for (int32_t comb_iter_82212 = 0; comb_iter_82212 < squot32(D_68333 +\n computed_group_sizze_70945 -\n 1,\n computed_group_sizze_70945);\n comb_iter_82212++) {\n int32_t ctid_70437;\n int32_t flat_comb_id_82213 = comb_iter_82212 *\n computed_group_sizze_70945 + local_tid_70442;\n \n ctid_70437 = flat_comb_id_82213;\n if (slt32(ctid_70437, D_68333) && 1) {\n double x_71050 = *(__global double *) &qs_mem_80369[(gtid_70430 *\n D_68322 +\n ltid_70431) *\n 8];\n \n *(__local double *) &mem_80477[ctid_70437 * 8] = x_71050;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82214;\n int32_t skip_waves_82215;\n double x_71052;\n double x_71053;\n \n offset_82214 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70442, D_68333)) {\n x_71052 = *(__local double *) &mem_80477[(local_tid_70442 +\n offset_82214) * 8];\n }\n }\n offset_82214 = 1;\n while (slt32(offset_82214, wave_sizze_82202)) {\n if (slt32(local_tid_70442 + offset_82214, D_68333) ",
"&&\n ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *\n wave_sizze_82202) & (2 * offset_82214 - 1)) == 0) {\n // read array element\n {\n x_71053 = *(volatile __local\n double *) &mem_80477[(local_tid_70442 +\n offset_82214) * 8];\n }\n // apply reduction operation\n {\n double res_71054;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n res_71054 = x_71052 + x_71053;\n }\n x_71052 = res_71054;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_80477[local_tid_70442 * 8] =\n x_71052;\n }\n }\n offset_82214 *= 2;\n }\n skip_waves_82215 = 1;\n while (slt32(skip_waves_82215, squot32(computed_group_sizze_70945 +\n wave_sizze_82202 - 1,\n wave_sizze_82202))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82214 = skip_waves_82215 * wave_sizze_82202;\n if (slt32(local_tid_70442 + offset_82214, D_68333) &&\n ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *\n wave_sizze_82202) == 0 && (squot32(local_tid_70442,\n wave_sizze_82202) & (2 *\n skip_waves_82215 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_71053 = *(__local double *) &mem_80477[(local_tid_70442 +\n offset_82214) * 8];\n }\n ",
" // apply reduction operation\n {\n double res_71054;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n res_71054 = x_71052 + x_71053;\n }\n x_71052 = res_71054;\n }\n // write result of operation\n {\n *(__local double *) &mem_80477[local_tid_70442 * 8] = x_71052;\n }\n }\n skip_waves_82215 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_71051 = *(__local double *) &mem_80477[0];\n \n double y_71055;\n double x_71056;\n double res_71057;\n \n if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,\n computed_group_sizze_70945)) {\n y_71055 = res_68453 * res_71051;\n x_71056 = x_71048 - y_71055;\n res_71057 = x_71056 - y_68475;\n }\n if (local_tid_70442 == 0) {\n *(__global double *) &mem_80480[group_id_70443 * 8] = res_71057;\n }\n}\n__kernel void map_intra_group_71320(__local volatile\n int64_t *mem_80524_backing_aligned_0,\n __local volatile\n int64_t *mem_80527_backing_aligned_1,\n __local volatile\n int64_t *mem_80597_backing_aligned_2,\n __local volatile\n int64_t *mem_80601_backing_aligned_3,\n __local volatile\n int64_t *mem_80605_backing_aligned_4,\n __local volatile\n int64_t *mem_80608_backing_aligned_5,\n int32_t N_68508, int32_t D_68509,\n int32_t K_68510, int32_t D_68512,\n",
" int32_t D_68514, int32_t triD_68516,\n double d_r_68524, int32_t D_68526,\n int32_t computed_group_sizze_71318, __global\n unsigned char *x_mem_80366, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *means_mem_80368, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *mem_80521, __global\n unsigned char *mem_80530, __global\n unsigned char *mem_80534, __global\n unsigned char *mem_80537, __global\n unsigned char *mem_80541, __global\n unsigned char *mem_80545, __global\n unsigned char *mem_80548, __global\n unsigned char *mem_80551, __global\n unsigned char *mem_80554, __global\n unsigned char *mem_80557, __global\n unsigned char *mem_80561, __global\n unsigned char *mem_80565, __global\n unsigned char *mem_80568, __global\n unsigned char *mem_80571, __global\n unsigned char *mem_80574, __global\n unsigned char *mem_80577, __global\n unsigned char *mem_80580, __global\n unsigned char *mem_80583, __global\n unsigned char *mem_80590, __global\n unsigned char",
" *mem_80593, __global\n unsigned char *mem_80612, __global\n unsigned char *mem_80617, __global\n unsigned char *mem_80622, __global\n unsigned char *mem_80627, __global\n unsigned char *mem_81683, __global\n unsigned char *mem_81686, __global\n unsigned char *mem_81693, __global\n unsigned char *mem_81696)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_80524_backing_0 =\n mem_80524_backing_aligned_0;\n __local volatile char *restrict mem_80527_backing_1 =\n mem_80527_backing_aligned_1;\n __local volatile char *restrict mem_80597_backing_2 =\n mem_80597_backing_aligned_2;\n __local volatile char *restrict mem_80601_backing_3 =\n mem_80601_backing_aligned_3;\n __local volatile char *restrict mem_80605_backing_4 =\n mem_80605_backing_aligned_4;\n __local volatile char *restrict mem_80608_backing_5 =\n mem_80608_backing_aligned_5;\n int32_t global_tid_71320;\n int32_t local_tid_71321;\n int32_t group_sizze_82424;\n int32_t wave_sizze_82423;\n int32_t group_id_71322;\n \n global_tid_71320 = get_global_id(0);\n local_tid_71321 = get_local_id(0);\n group_sizze_82424 = get_local_size(0);\n wave_sizze_82423 = LOCKSTEP_WIDTH;\n group_id_71322 = get_group_id(0);\n \n int32_t gtid_71203;\n int32_t ltid_71204;\n \n gtid_71203 = squot32(global_tid_71320, computed_group_sizze_71318);\n ltid_71204 = global_tid_71320 - squot32(global_tid_71320,\n computed_group_sizze_71318) *\n computed_group_sizze_7",
"1318;\n if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,\n computed_group_sizze_71318)) { }\n \n __local char *mem_80524;\n __local char *mem_80527;\n double res_72145;\n \n mem_80524 = (__local char *) mem_80524_backing_0;\n mem_80527 = (__local char *) mem_80527_backing_1;\n for (int32_t comb_iter_82425 = 0; comb_iter_82425 < squot32(K_68510 +\n computed_group_sizze_71318 -\n 1,\n computed_group_sizze_71318);\n comb_iter_82425++) {\n int32_t ctid_71241;\n int32_t flat_comb_id_82426 = comb_iter_82425 *\n computed_group_sizze_71318 + local_tid_71321;\n \n ctid_71241 = flat_comb_id_82426;\n if (slt32(ctid_71241, K_68510) && 1) {\n double alphas_elem_72060 = *(__global\n double *) &alphas_mem_80367[ltid_71204 *\n 8];\n double res_72065;\n double x_72068 = 0.0;\n int32_t chunk_sizze_72066;\n int32_t chunk_offset_72067 = 0;\n \n chunk_sizze_72066 = D_68526;\n \n double res_72070;\n double acc_72073 = x_72068;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_72071;\n \n groupstream_mapaccum_dummy_chunk_sizze_72071 = 1;\n if (chunk_sizze_72066 == D_68526) {\n for (int32_t i_72072 = 0; i_72072 < D_68526; i_72072++) {\n double x_72075;\n double res_72078;\n \n x_72075 = *(__global double *) &qs_mem_80369[(ltid_71204 *\n D_68514 +\n ",
" chunk_offset_72067 +\n i_72072) * 8];\n res_72078 = acc_72073 + x_72075;\n \n double acc_tmp_82427 = res_72078;\n \n acc_72073 = acc_tmp_82427;\n }\n } else {\n for (int32_t i_72072 = 0; i_72072 < chunk_sizze_72066;\n i_72072++) {\n double x_72075;\n double res_72078;\n \n x_72075 = *(__global double *) &qs_mem_80369[(ltid_71204 *\n D_68514 +\n chunk_offset_72067 +\n i_72072) * 8];\n res_72078 = acc_72073 + x_72075;\n \n double acc_tmp_82428 = res_72078;\n \n acc_72073 = acc_tmp_82428;\n }\n }\n res_72070 = acc_72073;\n x_72068 = res_72070;\n res_72065 = x_72068;\n \n double x_72079;\n \n x_72079 = alphas_elem_72060 + res_72065;\n for (int32_t i_72084 = 0; i_72084 < D_68526; i_72084++) {\n double x_elem_elem_72085;\n double means_elem_elem_72086;\n double res_72087;\n \n x_elem_elem_72085 = *(__global\n double *) &x_mem_80366[(gtid_71203 *\n D_68509 +\n i_72084) * 8];\n means_elem_elem_72086 = *(__global\n double *) &means_mem_80368[(ltid_71204 *\n D_68512 +\n ",
" i_72084) *\n 8];\n res_72087 = x_elem_elem_72085 - means_elem_elem_72086;\n *(__global double *) &mem_80521[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) + local_tid_71321 +\n i_72084 *\n computed_group_sizze_71318) *\n 8] = res_72087;\n }\n \n double res_72089;\n double x_72092 = 0.0;\n int32_t chunk_sizze_72090;\n int32_t chunk_offset_72091 = 0;\n \n chunk_sizze_72090 = D_68526;\n \n double res_72095;\n double acc_72098 = x_72092;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_72096;\n int32_t i_72097 = 0;\n \n groupstream_mapaccum_dummy_chunk_sizze_72096 = chunk_sizze_72090;\n for (int32_t i_72097 = 0; i_72097 < chunk_sizze_72090; i_72097++) {\n int32_t convop_x_79593;\n double qs_elem_elem_72102;\n \n convop_x_79593 = chunk_offset_72091 + i_72097;\n qs_elem_elem_72102 = *(__global\n double *) &qs_mem_80369[(ltid_71204 *\n D_68514 +\n chunk_offset_72091 +\n i_72097) * 8];\n \n double res_72104;\n double x_72107 = 0.0;\n int32_t chunk_sizze_72105;\n int32_t chunk_offset_72106 = 0;\n \n chunk_sizze_72105 = D_68526;\n ",
" \n double res_72110;\n double acc_72113 = x_72107;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_72111;\n \n groupstream_mapaccum_dummy_chunk_sizze_72111 = 1;\n if (chunk_sizze_72105 == D_68526) {\n for (int32_t i_72112 = 0; i_72112 < D_68526; i_72112++) {\n int32_t convop_x_79589;\n double x_72117;\n bool cond_72119;\n double res_72120;\n double res_72136;\n double res_72138;\n \n convop_x_79589 = chunk_offset_72106 + i_72112;\n x_72117 = *(__global\n double *) &mem_80521[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n computed_group_sizze_71318 *\n (chunk_offset_72106 +\n i_72112) + 0 *\n computed_group_sizze_71318) *\n 8];\n cond_72119 = slt32(convop_x_79593, convop_x_79589);\n if (cond_72119) {\n res_72120 = 0.0;\n } else {\n bool cond_72121;\n double res_72122;\n \n cond_72121 = convop_x_79593 == convop_x_79589;\n if (cond_72121) {\n double res_72123;\n \n ",
" res_72123 = futrts_exp64(qs_elem_elem_72102);\n res_72122 = res_72123;\n } else {\n int32_t y_72124;\n int32_t x_72125;\n int32_t res_72126;\n int32_t gmm_knossos_tri_arg_72127;\n int32_t y_72128;\n int32_t x_72129;\n int32_t res_72130;\n int32_t x_72131;\n int32_t x_72132;\n int32_t y_72133;\n int32_t i_72134;\n double res_72135;\n \n y_72124 = D_68526 - 1;\n x_72125 = D_68526 * y_72124;\n res_72126 = sdiv32(x_72125, 2);\n gmm_knossos_tri_arg_72127 = D_68526 -\n convop_x_79589;\n y_72128 = gmm_knossos_tri_arg_72127 - 1;\n x_72129 = gmm_knossos_tri_arg_72127 * y_72128;\n res_72130 = sdiv32(x_72129, 2);\n x_72131 = res_72126 - res_72130;\n x_72132 = convop_x_79593 - convop_x_79589;\n y_72133 = x_72132 - 1;\n i_72134 = x_72131 + y_72133;\n res_72135 = *(__global\n double *) &icf_mem_80370[(ltid_71204 *\n triD_68516 +\n i_72134) *\n 8];\n res_72122 ",
"= res_72135;\n }\n res_72120 = res_72122;\n }\n res_72136 = x_72117 * res_72120;\n res_72138 = acc_72113 + res_72136;\n \n double acc_tmp_82430 = res_72138;\n \n acc_72113 = acc_tmp_82430;\n }\n } else {\n for (int32_t i_72112 = 0; i_72112 < chunk_sizze_72105;\n i_72112++) {\n int32_t convop_x_79589;\n double x_72117;\n bool cond_72119;\n double res_72120;\n double res_72136;\n double res_72138;\n \n convop_x_79589 = chunk_offset_72106 + i_72112;\n x_72117 = *(__global\n double *) &mem_80521[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n computed_group_sizze_71318 *\n (chunk_offset_72106 +\n i_72112) + 0 *\n computed_group_sizze_71318) *\n 8];\n cond_72119 = slt32(convop_x_79593, convop_x_79589);\n if (cond_72119) {\n res_72120 = 0.0;\n } else {\n bool cond_72121;\n double res_72122;\n \n cond_7",
"2121 = convop_x_79593 == convop_x_79589;\n if (cond_72121) {\n double res_72123;\n \n res_72123 = futrts_exp64(qs_elem_elem_72102);\n res_72122 = res_72123;\n } else {\n int32_t y_72124;\n int32_t x_72125;\n int32_t res_72126;\n int32_t gmm_knossos_tri_arg_72127;\n int32_t y_72128;\n int32_t x_72129;\n int32_t res_72130;\n int32_t x_72131;\n int32_t x_72132;\n int32_t y_72133;\n int32_t i_72134;\n double res_72135;\n \n y_72124 = D_68526 - 1;\n x_72125 = D_68526 * y_72124;\n res_72126 = sdiv32(x_72125, 2);\n gmm_knossos_tri_arg_72127 = D_68526 -\n convop_x_79589;\n y_72128 = gmm_knossos_tri_arg_72127 - 1;\n x_72129 = gmm_knossos_tri_arg_72127 * y_72128;\n res_72130 = sdiv32(x_72129, 2);\n x_72131 = res_72126 - res_72130;\n x_72132 = convop_x_79593 - convop_x_79589;\n y_72133 = x_72132 - 1;\n i_72134 = x_72131 + y_72133;\n res_72135 = *(__global\n double *) &icf_mem_80370[(ltid_71204 *\n triD_68516 +\n ",
" i_72134) *\n 8];\n res_72122 = res_72135;\n }\n res_72120 = res_72122;\n }\n res_72136 = x_72117 * res_72120;\n res_72138 = acc_72113 + res_72136;\n \n double acc_tmp_82431 = res_72138;\n \n acc_72113 = acc_tmp_82431;\n }\n }\n res_72110 = acc_72113;\n x_72107 = res_72110;\n res_72104 = x_72107;\n \n double res_72139;\n double res_72141;\n \n res_72139 = res_72104 * res_72104;\n res_72141 = acc_72098 + res_72139;\n acc_72098 = res_72141;\n }\n res_72095 = acc_72098;\n x_72092 = res_72095;\n res_72089 = x_72092;\n \n double y_72142;\n double res_72143;\n double res_72144;\n \n y_72142 = 0.5 * res_72089;\n res_72143 = x_72079 - y_72142;\n res_72144 = futrts_exp64(res_72143);\n *(__local double *) &mem_80524[ctid_71241 * 8] = res_72144;\n *(__local double *) &mem_80527[ctid_71241 * 8] = res_72143;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82432;\n int32_t skip_waves_82433;\n double x_72146;\n double x_72147;\n \n offset_82432 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_71321, K_68510)) {\n x_72146 = *(__local double *) &mem_80524[(local_tid_71321 +\n offset_82432) * 8];\n }\n }\n offset_82432 = 1;\n while (slt32(offset_82432, wave_sizze_82423)) {\n ",
" if (slt32(local_tid_71321 + offset_82432, K_68510) &&\n ((local_tid_71321 - squot32(local_tid_71321, wave_sizze_82423) *\n wave_sizze_82423) & (2 * offset_82432 - 1)) == 0) {\n // read array element\n {\n x_72147 = *(volatile __local\n double *) &mem_80524[(local_tid_71321 +\n offset_82432) * 8];\n }\n // apply reduction operation\n {\n double res_72148;\n \n if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,\n computed_group_sizze_71318)) {\n res_72148 = x_72146 + x_72147;\n }\n x_72146 = res_72148;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_80524[local_tid_71321 * 8] =\n x_72146;\n }\n }\n offset_82432 *= 2;\n }\n skip_waves_82433 = 1;\n while (slt32(skip_waves_82433, squot32(computed_group_sizze_71318 +\n wave_sizze_82423 - 1,\n wave_sizze_82423))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82432 = skip_waves_82433 * wave_sizze_82423;\n if (slt32(local_tid_71321 + offset_82432, K_68510) &&\n ((local_tid_71321 - squot32(local_tid_71321, wave_sizze_82423) *\n wave_sizze_82423) == 0 && (squot32(local_tid_71321,\n wave_sizze_82423) & (2 *\n skip_waves_82433 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_72147 = *(__local double *) &mem_80524[(local_tid_71321 +\n ",
" offset_82432) * 8];\n }\n // apply reduction operation\n {\n double res_72148;\n \n if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,\n computed_group_sizze_71318)) {\n res_72148 = x_72146 + x_72147;\n }\n x_72146 = res_72148;\n }\n // write result of operation\n {\n *(__local double *) &mem_80524[local_tid_71321 * 8] = x_72146;\n }\n }\n skip_waves_82433 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_72145 = *(__local double *) &mem_80524[0];\n \n double x_72149;\n double res_72150;\n \n if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,\n computed_group_sizze_71318)) {\n x_72149 = 1.0 / res_72145;\n res_72150 = d_r_68524 * x_72149;\n }\n \n __local char *mem_80597;\n __local char *mem_80601;\n __local char *mem_80605;\n __local char *mem_80608;\n \n mem_80597 = (__local char *) mem_80597_backing_2;\n mem_80601 = (__local char *) mem_80601_backing_3;\n mem_80605 = (__local char *) mem_80605_backing_4;\n mem_80608 = (__local char *) mem_80608_backing_5;\n for (int32_t comb_iter_82434 = 0; comb_iter_82434 < squot32(K_68510 +\n computed_group_sizze_71318 -\n 1,\n computed_group_sizze_71318);\n comb_iter_82434++) {\n int32_t ctid_71310;\n int32_t flat_comb_id_82435 = comb_iter_82434 *\n computed_group_sizze_71318 + local_tid_71321;\n \n ctid_71310 = flat_comb_id_82435;\n if (slt32(ctid_71310, K_68510) && 1) {\n double res_elem_72156;\n double res_72160;\n ",
"double res_72161;\n double y_72201;\n double rev_sqnorm_arg_72202;\n \n res_elem_72156 = *(__local double *) &mem_80527[ltid_71204 * 8];\n res_72160 = futrts_exp64(res_elem_72156);\n res_72161 = res_72150 * res_72160;\n for (int32_t i_72170 = 0; i_72170 < D_68526; i_72170++) {\n double qs_elem_elem_72172;\n double x_elem_elem_72173;\n double means_elem_elem_72174;\n double res_72198;\n \n qs_elem_elem_72172 = *(__global\n double *) &qs_mem_80369[(ltid_71204 *\n D_68514 +\n i_72170) * 8];\n x_elem_elem_72173 = *(__global\n double *) &x_mem_80366[(gtid_71203 *\n D_68509 +\n i_72170) * 8];\n means_elem_elem_72174 = *(__global\n double *) &means_mem_80368[(ltid_71204 *\n D_68512 +\n i_72170) *\n 8];\n for (int32_t i_72178 = 0; i_72178 < D_68526; i_72178++) {\n bool cond_72180;\n double res_72181;\n \n cond_72180 = slt32(i_72170, i_72178);\n if (cond_72180) {\n res_72181 = 0.0;\n } else {\n bool cond_72182;\n double res_72183;\n \n cond_72182 = i_72170 == i_72178;\n if (cond_72182) {\n ",
" double res_72184;\n \n res_72184 = futrts_exp64(qs_elem_elem_72172);\n res_72183 = res_72184;\n } else {\n int32_t y_72185;\n int32_t x_72186;\n int32_t res_72187;\n int32_t gmm_knossos_tri_arg_72188;\n int32_t y_72189;\n int32_t x_72190;\n int32_t res_72191;\n int32_t x_72192;\n int32_t x_72193;\n int32_t y_72194;\n int32_t i_72195;\n double res_72196;\n \n y_72185 = D_68526 - 1;\n x_72186 = D_68526 * y_72185;\n res_72187 = sdiv32(x_72186, 2);\n gmm_knossos_tri_arg_72188 = D_68526 - i_72178;\n y_72189 = gmm_knossos_tri_arg_72188 - 1;\n x_72190 = gmm_knossos_tri_arg_72188 * y_72189;\n res_72191 = sdiv32(x_72190, 2);\n x_72192 = res_72187 - res_72191;\n x_72193 = i_72170 - i_72178;\n y_72194 = x_72193 - 1;\n i_72195 = x_72192 + y_72194;\n res_72196 = *(__global\n double *) &icf_mem_80370[(ltid_71204 *\n triD_68516 +\n i_72195) *\n 8];\n res_72183 = res_72196;\n }\n res_72181 = res_72183;\n }\n ",
" *(__global double *) &mem_80557[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 + i_72178 *\n computed_group_sizze_71318) *\n 8] = res_72181;\n }\n res_72198 = x_elem_elem_72173 - means_elem_elem_72174;\n *(__global double *) &mem_80530[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) + local_tid_71321 +\n i_72170 *\n computed_group_sizze_71318) *\n 8] = res_72198;\n for (int32_t i_82439 = 0; i_82439 < D_68526; i_82439++) {\n *(__global double *) &mem_80534[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 * D_68526) +\n local_tid_71321 + i_72170 *\n (computed_group_sizze_71318 *\n D_68526) + i_82439 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80557[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n ",
" local_tid_71321 +\n i_82439 *\n computed_group_sizze_71318) *\n 8];\n }\n }\n y_72201 = 0.0 - res_72161;\n rev_sqnorm_arg_72202 = 0.5 * y_72201;\n for (int32_t i_72206 = 0; i_72206 < D_68526; i_72206++) {\n double res_72208;\n double res_72217;\n double res_72218;\n double redout_72209 = 0.0;\n \n for (int32_t i_72210 = 0; i_72210 < D_68526; i_72210++) {\n double x_72211;\n double x_72212;\n double res_72213;\n double res_72216;\n \n x_72211 = *(__global double *) &mem_80530[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_72210 *\n computed_group_sizze_71318) *\n 8];\n x_72212 = *(__global double *) &mem_80534[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 *\n D_68526) +\n local_tid_71321 +\n (i_72206 *\n ",
" (computed_group_sizze_71318 *\n D_68526) +\n i_72210 *\n computed_group_sizze_71318)) *\n 8];\n res_72213 = x_72211 * x_72212;\n res_72216 = redout_72209 + res_72213;\n \n double redout_tmp_82441 = res_72216;\n \n redout_72209 = redout_tmp_82441;\n }\n res_72208 = redout_72209;\n res_72217 = rev_sqnorm_arg_72202 * res_72208;\n res_72218 = res_72217 + res_72217;\n *(__global double *) &mem_80537[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) + local_tid_71321 +\n i_72206 *\n computed_group_sizze_71318) *\n 8] = res_72218;\n }\n for (int32_t i_72230 = 0; i_72230 < D_68526; i_72230++) {\n double x_72231;\n double qs_elem_elem_72234;\n double res_72235;\n double res_72244;\n \n x_72231 = *(__global double *) &mem_80537[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_72230 *\n computed_group_sizze_71318) *\n ",
" 8];\n qs_elem_elem_72234 = *(__global\n double *) &qs_mem_80369[(ltid_71204 *\n D_68514 +\n i_72230) * 8];\n \n double redout_72236 = 0.0;\n \n for (int32_t i_72237 = 0; i_72237 < D_68526; i_72237++) {\n double x_72238;\n double x_72239;\n double res_72240;\n double res_72243;\n \n x_72238 = *(__global double *) &mem_80534[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 *\n D_68526) +\n local_tid_71321 +\n (i_72237 *\n (computed_group_sizze_71318 *\n D_68526) +\n i_72230 *\n computed_group_sizze_71318)) *\n 8];\n x_72239 = *(__global double *) &mem_80537[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_72237 *\n computed_group_sizze_71318) *\n ",
" 8];\n res_72240 = x_72238 * x_72239;\n res_72243 = redout_72236 + res_72240;\n \n double redout_tmp_82445 = res_72243;\n \n redout_72236 = redout_tmp_82445;\n }\n res_72235 = redout_72236;\n res_72244 = 0.0 - res_72235;\n for (int32_t i_72251 = 0; i_72251 < D_68526; i_72251++) {\n double x_72252;\n double res_72254;\n bool cond_72255;\n bool cond_72256;\n \n x_72252 = *(__global double *) &mem_80530[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_72251 *\n computed_group_sizze_71318) *\n 8];\n res_72254 = x_72231 * x_72252;\n cond_72255 = slt32(i_72230, i_72251);\n cond_72256 = i_72230 == i_72251;\n if (cond_72255) {\n for (int32_t i_82448 = 0; i_82448 < D_68526;\n i_82448++) {\n *(__global double *) &mem_80568[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82448 *\n computed",
"_group_sizze_71318) *\n 8] = 0.0;\n }\n for (int32_t i_82449 = 0; i_82449 < triD_68516;\n i_82449++) {\n *(__global double *) &mem_80571[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82449 *\n computed_group_sizze_71318) *\n 8] = 0.0;\n }\n for (int32_t i_82450 = 0; i_82450 < D_68526;\n i_82450++) {\n *(__global double *) &mem_81696[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82450 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80568[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n ",
" i_82450 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82451 = 0; i_82451 < triD_68516;\n i_82451++) {\n *(__global double *) &mem_81693[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82451 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80571[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82451 *\n computed_group_sizze_71318) *\n 8];\n }\n } else {\n if (cond_72256) {\n double res_72263;\n double deltaVec_arg_72264;\n \n res_72263 = futrts_exp64(qs_elem_elem_72234);\n ",
" deltaVec_arg_72264 = res_72254 * res_72263;\n for (int32_t i_72269 = 0; i_72269 < D_68526;\n i_72269++) {\n bool cond_72271;\n double res_72272;\n \n cond_72271 = i_72269 == i_72230;\n if (cond_72271) {\n res_72272 = deltaVec_arg_72264;\n } else {\n res_72272 = 0.0;\n }\n *(__global\n double *) &mem_80574[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_72269 *\n computed_group_sizze_71318) *\n 8] = res_72272;\n }\n for (int32_t i_82453 = 0; i_82453 < triD_68516;\n i_82453++) {\n *(__global\n double *) &mem_80577[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82453 *\n computed_group_sizze_71318) *\n 8] = 0.0;\n }\n for (int32_t i_82454 = 0; i_82454 < D_68526;\n ",
" i_82454++) {\n *(__global\n double *) &mem_81686[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82454 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80574[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82454 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82455 = 0; i_82455 < triD_68516;\n i_82455++) {\n *(__global\n double *) &mem_81683[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82455 *\n computed_group_sizze_71318) *\n ",
" 8] = *(__global\n double *) &mem_80577[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82455 *\n computed_group_sizze_71318) *\n 8];\n }\n } else {\n int32_t y_72275;\n int32_t x_72276;\n int32_t res_72277;\n int32_t deltaVec_arg_72278;\n \n y_72275 = i_72230 - 1;\n x_72276 = i_72230 * y_72275;\n res_72277 = sdiv32(x_72276, 2);\n deltaVec_arg_72278 = i_72251 + res_72277;\n for (int32_t i_72283 = 0; i_72283 < triD_68516;\n i_72283++) {\n bool cond_72285;\n double res_72286;\n \n cond_72285 = i_72283 == deltaVec_arg_72278;\n if (cond_72285) {\n res_72286 = res_72254;\n } else {\n res_72286 = 0.0;\n }\n *(__global\n double *) &mem_80580[(group_id_71322 *\n ",
" (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_72283 *\n computed_group_sizze_71318) *\n 8] = res_72286;\n }\n for (int32_t i_82457 = 0; i_82457 < D_68526;\n i_82457++) {\n *(__global\n double *) &mem_80583[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82457 *\n computed_group_sizze_71318) *\n 8] = 0.0;\n }\n for (int32_t i_82458 = 0; i_82458 < D_68526;\n i_82458++) {\n *(__global\n double *) &mem_81686[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82458 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80583[(group_id_71322 *\n ",
" (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82458 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82459 = 0; i_82459 < triD_68516;\n i_82459++) {\n *(__global\n double *) &mem_81683[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82459 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80580[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82459 *\n computed_group_sizze_71318) *\n 8];\n ",
" }\n }\n for (int32_t i_82460 = 0; i_82460 < D_68526;\n i_82460++) {\n *(__global double *) &mem_81696[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82460 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_81686[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82460 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82461 = 0; i_82461 < triD_68516;\n i_82461++) {\n *(__global double *) &mem_81693[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82461 *\n ",
" computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_81683[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82461 *\n computed_group_sizze_71318) *\n 8];\n }\n }\n for (int32_t i_82462 = 0; i_82462 < D_68526; i_82462++) {\n *(__global double *) &mem_80561[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 * D_68526) +\n local_tid_71321 +\n i_72251 *\n (computed_group_sizze_71318 *\n D_68526) + i_82462 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_81696[(group_id_71322 *\n (computed_group_sizze_71318 *\n ",
" D_68526) +\n local_tid_71321 +\n i_82462 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82463 = 0; i_82463 < triD_68516; i_82463++) {\n *(__global double *) &mem_80565[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516 *\n D_68526) +\n local_tid_71321 +\n i_72251 *\n (computed_group_sizze_71318 *\n triD_68516) +\n i_82463 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_81693[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82463 *\n computed_group_sizze_71318)",
" *\n 8];\n }\n }\n for (int32_t i_72295 = 0; i_72295 < D_68526; i_72295++) {\n double res_72297;\n double redout_72298 = 0.0;\n \n for (int32_t i_72299 = 0; i_72299 < D_68526; i_72299++) {\n double x_72300;\n double res_72303;\n \n x_72300 = *(__global\n double *) &mem_80561[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 * D_68526) +\n local_tid_71321 +\n (i_72299 *\n (computed_group_sizze_71318 *\n D_68526) + i_72295 *\n computed_group_sizze_71318)) *\n 8];\n res_72303 = redout_72298 + x_72300;\n \n double redout_tmp_82465 = res_72303;\n \n redout_72298 = redout_tmp_82465;\n }\n res_72297 = redout_72298;\n *(__global double *) &mem_80590[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 + i_72295 *\n computed_group_sizze_71318) *\n 8] = res_72297;\n ",
" }\n for (int32_t i_72309 = 0; i_72309 < triD_68516; i_72309++) {\n double res_72311;\n double redout_72312 = 0.0;\n \n for (int32_t i_72313 = 0; i_72313 < D_68526; i_72313++) {\n double x_72314;\n double res_72317;\n \n x_72314 = *(__global\n double *) &mem_80565[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516 *\n D_68526) +\n local_tid_71321 +\n (i_72313 *\n (computed_group_sizze_71318 *\n triD_68516) +\n i_72309 *\n computed_group_sizze_71318)) *\n 8];\n res_72317 = redout_72312 + x_72314;\n \n double redout_tmp_82467 = res_72317;\n \n redout_72312 = redout_tmp_82467;\n }\n res_72311 = redout_72312;\n *(__global double *) &mem_80593[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 + i_72309 *\n computed_group_sizze_71318) *\n 8] = re",
"s_72311;\n }\n for (int32_t i_82468 = 0; i_82468 < D_68526; i_82468++) {\n *(__global double *) &mem_80541[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 * D_68526) +\n local_tid_71321 + i_72230 *\n (computed_group_sizze_71318 *\n D_68526) + i_82468 *\n computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80590[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82468 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82469 = 0; i_82469 < triD_68516; i_82469++) {\n *(__global double *) &mem_80545[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516 * D_68526) +\n local_tid_71321 + i_72230 *\n (computed_group_sizze_71318 *\n triD_68516) + i_82469 *\n ",
" computed_group_sizze_71318) *\n 8] = *(__global\n double *) &mem_80593[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82469 *\n computed_group_sizze_71318) *\n 8];\n }\n *(__global double *) &mem_80548[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) + local_tid_71321 +\n i_72230 *\n computed_group_sizze_71318) *\n 8] = res_72244;\n }\n for (int32_t i_72326 = 0; i_72326 < triD_68516; i_72326++) {\n double res_72328;\n double redout_72329 = 0.0;\n \n for (int32_t i_72330 = 0; i_72330 < D_68526; i_72330++) {\n double x_72331;\n double res_72334;\n \n x_72331 = *(__global double *) &mem_80545[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516 *\n D_68526) +\n loc",
"al_tid_71321 +\n (i_72330 *\n (computed_group_sizze_71318 *\n triD_68516) +\n i_72326 *\n computed_group_sizze_71318)) *\n 8];\n res_72334 = redout_72329 + x_72331;\n \n double redout_tmp_82471 = res_72334;\n \n redout_72329 = redout_tmp_82471;\n }\n res_72328 = redout_72329;\n *(__global double *) &mem_80551[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 + i_72326 *\n computed_group_sizze_71318) *\n 8] = res_72328;\n }\n for (int32_t i_72340 = 0; i_72340 < D_68526; i_72340++) {\n double res_72342;\n double res_72349;\n double redout_72343 = 0.0;\n \n for (int32_t i_72344 = 0; i_72344 < D_68526; i_72344++) {\n double x_72345;\n double res_72348;\n \n x_72345 = *(__global double *) &mem_80541[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526 *\n D_68526) +\n local_tid_71321 +\n ",
" (i_72344 *\n (computed_group_sizze_71318 *\n D_68526) +\n i_72340 *\n computed_group_sizze_71318)) *\n 8];\n res_72348 = redout_72343 + x_72345;\n \n double redout_tmp_82473 = res_72348;\n \n redout_72343 = redout_tmp_82473;\n }\n res_72342 = redout_72343;\n res_72349 = res_72161 + res_72342;\n *(__global double *) &mem_80554[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) + local_tid_71321 +\n i_72340 *\n computed_group_sizze_71318) *\n 8] = res_72349;\n }\n for (int32_t i_82474 = 0; i_82474 < D_68526; i_82474++) {\n *(__local double *) &mem_80597[(ctid_71310 * D_68526 +\n i_82474) * 8] = *(__global\n double *) &mem_80548[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82474 *\n ",
" computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82475 = 0; i_82475 < D_68526; i_82475++) {\n *(__local double *) &mem_80601[(ctid_71310 * D_68526 +\n i_82475) * 8] = *(__global\n double *) &mem_80554[(group_id_71322 *\n (computed_group_sizze_71318 *\n D_68526) +\n local_tid_71321 +\n i_82475 *\n computed_group_sizze_71318) *\n 8];\n }\n for (int32_t i_82476 = 0; i_82476 < triD_68516; i_82476++) {\n *(__local double *) &mem_80605[(ctid_71310 * triD_68516 +\n i_82476) * 8] = *(__global\n double *) &mem_80551[(group_id_71322 *\n (computed_group_sizze_71318 *\n triD_68516) +\n local_tid_71321 +\n i_82476 *\n computed_gro",
"up_sizze_71318) *\n 8];\n }\n *(__local double *) &mem_80608[ctid_71310 * 8] = res_72161;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n for (int32_t i_82477 = 0; i_82477 < squot32(K_68510 - local_tid_71321 +\n computed_group_sizze_71318 - 1,\n computed_group_sizze_71318);\n i_82477++) {\n *(__global double *) &mem_80612[(group_id_71322 * K_68510 + (i_82477 *\n computed_group_sizze_71318 +\n local_tid_71321)) *\n 8] = *(__local\n double *) &mem_80608[(i_82477 *\n computed_group_sizze_71318 +\n local_tid_71321) *\n 8];\n }\n for (int32_t i_82478 = 0; i_82478 < squot32(K_68510 * D_68526 -\n local_tid_71321 +\n computed_group_sizze_71318 - 1,\n computed_group_sizze_71318);\n i_82478++) {\n *(__global double *) &mem_80617[(group_id_71322 * (D_68526 * K_68510) +\n squot32(i_82478 *\n computed_group_sizze_71318 +\n local_tid_71321, D_68526) *\n D_68526 + (i_82478 *\n computed_group_sizze_71318 +\n local_tid_71321 -\n ",
" squot32(i_82478 *\n computed_group_sizze_71318 +\n local_tid_71321,\n D_68526) *\n D_68526)) * 8] = *(__local\n double *) &mem_80597[(squot32(i_82478 *\n computed_group_sizze_71318 +\n local_tid_71321,\n D_68526) *\n D_68526 +\n (i_82478 *\n computed_group_sizze_71318 +\n local_tid_71321 -\n squot32(i_82478 *\n computed_group_sizze_71318 +\n local_tid_71321,\n D_68526) *\n D_68526)) *\n 8];\n }\n for (int32_t i_82479 = 0; i_82",
"479 < squot32(K_68510 * D_68526 -\n local_tid_71321 +\n computed_group_sizze_71318 - 1,\n computed_group_sizze_71318);\n i_82479++) {\n *(__global double *) &mem_80622[(group_id_71322 * (D_68526 * K_68510) +\n squot32(i_82479 *\n computed_group_sizze_71318 +\n local_tid_71321, D_68526) *\n D_68526 + (i_82479 *\n computed_group_sizze_71318 +\n local_tid_71321 -\n squot32(i_82479 *\n computed_group_sizze_71318 +\n local_tid_71321,\n D_68526) *\n D_68526)) * 8] = *(__local\n double *) &mem_80601[(squot32(i_82479 *\n computed_group_sizze_71318 +\n local_tid_71321,\n D_68526) *\n D_68526 +\n (i_82479 *\n computed_group_sizze_71318 +\n ",
" local_tid_71321 -\n squot32(i_82479 *\n computed_group_sizze_71318 +\n local_tid_71321,\n D_68526) *\n D_68526)) *\n 8];\n }\n for (int32_t i_82480 = 0; i_82480 < squot32(K_68510 * triD_68516 -\n local_tid_71321 +\n computed_group_sizze_71318 - 1,\n computed_group_sizze_71318);\n i_82480++) {\n *(__global double *) &mem_80627[(group_id_71322 * (triD_68516 *\n K_68510) +\n squot32(i_82480 *\n computed_group_sizze_71318 +\n local_tid_71321, triD_68516) *\n triD_68516 + (i_82480 *\n computed_group_sizze_71318 +\n local_tid_71321 -\n squot32(i_82480 *\n computed_group_sizze_71318 +\n local_tid_71321,\n triD_68516) *\n ",
" triD_68516)) * 8] =\n *(__local double *) &mem_80605[(squot32(i_82480 *\n computed_group_sizze_71318 +\n local_tid_71321,\n triD_68516) * triD_68516 +\n (i_82480 *\n computed_group_sizze_71318 +\n local_tid_71321 - squot32(i_82480 *\n computed_group_sizze_71318 +\n local_tid_71321,\n triD_68516) *\n triD_68516)) * 8];\n }\n}\n__kernel void map_intra_group_72545(__local volatile\n int64_t *mem_80774_backing_aligned_0,\n __local volatile\n int64_t *mem_80778_backing_aligned_1,\n __local volatile\n int64_t *mem_80781_backing_aligned_2,\n __local volatile\n int64_t *mem_80821_backing_aligned_3,\n __local volatile\n int64_t *mem_80825_backing_aligned_4,\n __local volatile\n int64_t *mem_80828_backing_aligned_5,\n __local volatile\n int64_t *mem_80831_backing_aligned_6,\n __local volatile\n int64_t *mem_80834_backing_aligned_7,\n int32_t N_68508, int32_t D_68509,\n ",
" int32_t K_68510, int32_t D_68512,\n int32_t D_68514, int32_t triD_68516,\n int32_t D_68526,\n int32_t computed_group_sizze_76480, __global\n unsigned char *x_mem_80366, __global\n unsigned char *means_mem_80368, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *mem_80652, __global\n unsigned char *mem_80768, __global\n unsigned char *mem_80771, __global\n unsigned char *mem_80785, __global\n unsigned char *mem_80789, __global\n unsigned char *mem_80792, __global\n unsigned char *mem_80795, __global\n unsigned char *mem_80798, __global\n unsigned char *mem_80801, __global\n unsigned char *mem_80804, __global\n unsigned char *mem_80807, __global\n unsigned char *mem_80810, __global\n unsigned char *mem_80813, __global\n unsigned char *mem_80837, __global\n unsigned char *mem_80841, __global\n unsigned char *mem_80845, __global\n unsigned char *mem_80849, __global\n unsigned char *mem_81729, __global\n unsigned char *mem_81732, __global\n unsigned char *mem_81739, __global\n ",
" unsigned char *mem_81742)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_80774_backing_0 =\n mem_80774_backing_aligned_0;\n __local volatile char *restrict mem_80778_backing_1 =\n mem_80778_backing_aligned_1;\n __local volatile char *restrict mem_80781_backing_2 =\n mem_80781_backing_aligned_2;\n __local volatile char *restrict mem_80821_backing_3 =\n mem_80821_backing_aligned_3;\n __local volatile char *restrict mem_80825_backing_4 =\n mem_80825_backing_aligned_4;\n __local volatile char *restrict mem_80828_backing_5 =\n mem_80828_backing_aligned_5;\n __local volatile char *restrict mem_80831_backing_6 =\n mem_80831_backing_aligned_6;\n __local volatile char *restrict mem_80834_backing_7 =\n mem_80834_backing_aligned_7;\n int32_t global_tid_72545;\n int32_t local_tid_72546;\n int32_t group_sizze_82575;\n int32_t wave_sizze_82574;\n int32_t group_id_72547;\n \n global_tid_72545 = get_global_id(0);\n local_tid_72546 = get_local_id(0);\n group_sizze_82575 = get_local_size(0);\n wave_sizze_82574 = LOCKSTEP_WIDTH;\n group_id_72547 = get_group_id(0);\n \n int32_t gtid_72454;\n int32_t gtid_72455;\n int32_t ltid_72457;\n \n gtid_72454 = squot32(global_tid_72545, K_68510 *\n computed_group_sizze_76480);\n gtid_72455 = squot32(global_tid_72545 - squot32(global_tid_72545, K_68510 *\n computed_group_sizze_76480) *\n (K_68510 * computed_group_sizze_76480),\n computed_group_sizze_76480);\n ltid_72457 = global_tid_72545 - squot32(global_tid_72545, K_68510 *\n computed_grou",
"p_sizze_76480) *\n (K_68510 * computed_group_sizze_76480) - squot32(global_tid_72545 -\n squot32(global_tid_72545,\n K_68510 *\n computed_group_sizze_76480) *\n (K_68510 *\n computed_group_sizze_76480),\n computed_group_sizze_76480) *\n computed_group_sizze_76480;\n \n double res_76700;\n double res_elem_76702;\n double res_76706;\n double res_76707;\n \n if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&\n slt32(ltid_72457, computed_group_sizze_76480)) {\n res_76700 = *(__global double *) &mem_80652[gtid_72454 * 8];\n res_elem_76702 = *(__global double *) &mem_80768[(gtid_72455 * N_68508 +\n gtid_72454) * 8];\n res_76706 = futrts_exp64(res_elem_76702);\n res_76707 = res_76700 * res_76706;\n }\n \n __local char *mem_80774;\n __local char *mem_80778;\n \n mem_80774 = (__local char *) mem_80774_backing_0;\n mem_80778 = (__local char *) mem_80778_backing_1;\n for (int32_t comb_iter_82576 = 0; comb_iter_82576 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82576++) {\n int32_t ctid_72462;\n int32_t flat_comb_id_82577 = comb_iter_82576 *\n computed_group_sizze_76480 + local_tid_72546;\n \n ctid_72462 = flat_comb_id_82577;\n if (slt32(ctid_72462, D_68526) && 1) {\n double qs_elem_elem_76713;\n ",
" double x_elem_elem_76714;\n double means_elem_elem_76715;\n double res_76739;\n \n qs_elem_elem_76713 = *(__global\n double *) &qs_mem_80369[(gtid_72455 *\n D_68514 +\n ltid_72457) * 8];\n x_elem_elem_76714 = *(__global double *) &x_mem_80366[(gtid_72454 *\n D_68509 +\n ltid_72457) *\n 8];\n means_elem_elem_76715 = *(__global\n double *) &means_mem_80368[(gtid_72455 *\n D_68512 +\n ltid_72457) *\n 8];\n for (int32_t i_76719 = 0; i_76719 < D_68526; i_76719++) {\n bool cond_76721;\n double res_76722;\n \n cond_76721 = slt32(ltid_72457, i_76719);\n if (cond_76721) {\n res_76722 = 0.0;\n } else {\n bool cond_76723;\n double res_76724;\n \n cond_76723 = ltid_72457 == i_76719;\n if (cond_76723) {\n double res_76725;\n \n res_76725 = futrts_exp64(qs_elem_elem_76713);\n res_76724 = res_76725;\n } else {\n int32_t y_76726;\n int32_t x_76727;\n int32_t res_76728;\n int32_t gmm_knossos_tri_arg_76729;\n int32_t y_76730;\n i",
"nt32_t x_76731;\n int32_t res_76732;\n int32_t x_76733;\n int32_t x_76734;\n int32_t y_76735;\n int32_t i_76736;\n double res_76737;\n \n y_76726 = D_68526 - 1;\n x_76727 = D_68526 * y_76726;\n res_76728 = sdiv32(x_76727, 2);\n gmm_knossos_tri_arg_76729 = D_68526 - i_76719;\n y_76730 = gmm_knossos_tri_arg_76729 - 1;\n x_76731 = gmm_knossos_tri_arg_76729 * y_76730;\n res_76732 = sdiv32(x_76731, 2);\n x_76733 = res_76728 - res_76732;\n x_76734 = ltid_72457 - i_76719;\n y_76735 = x_76734 - 1;\n i_76736 = x_76733 + y_76735;\n res_76737 = *(__global\n double *) &icf_mem_80370[(gtid_72455 *\n triD_68516 +\n i_76736) * 8];\n res_76724 = res_76737;\n }\n res_76722 = res_76724;\n }\n *(__global double *) &mem_80771[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) + local_tid_72546 +\n i_76719 *\n computed_group_sizze_76480) *\n 8] = res_76722;\n }\n res_76739 = x_elem_elem_76714 - means_elem_elem_76715;\n *(__local double *) &mem_80774[ctid_72462 * 8] = res_76739;\n for (int32_t i_82579 = 0; i_82579 < D_68526; i_82579++) {\n *(_",
"_local double *) &mem_80778[(ctid_72462 * D_68526 +\n i_82579) * 8] = *(__global\n double *) &mem_80771[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82579 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n double y_76740;\n double rev_sqnorm_arg_76741;\n \n if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&\n slt32(ltid_72457, computed_group_sizze_76480)) {\n y_76740 = 0.0 - res_76707;\n rev_sqnorm_arg_76741 = 0.5 * y_76740;\n }\n \n __local char *mem_80781;\n \n mem_80781 = (__local char *) mem_80781_backing_2;\n for (int32_t comb_iter_82580 = 0; comb_iter_82580 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82580++) {\n int32_t ctid_72474;\n int32_t flat_comb_id_82581 = comb_iter_82580 *\n computed_group_sizze_76480 + local_tid_72546;\n \n ctid_72474 = flat_comb_id_82581;\n if (slt32(ctid_72474, D_68526) && 1) {\n double res_76744;\n double x_76747 = 0.0;\n int32_t chunk_sizze_76745;\n ",
" int32_t chunk_offset_76746 = 0;\n \n chunk_sizze_76745 = D_68526;\n \n double res_76750;\n double acc_76753 = x_76747;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_76751;\n \n groupstream_mapaccum_dummy_chunk_sizze_76751 = 1;\n if (chunk_sizze_76745 == D_68526) {\n for (int32_t i_76752 = 0; i_76752 < D_68526; i_76752++) {\n double x_76756;\n double x_76757;\n double res_76759;\n double res_76761;\n \n x_76756 = *(__local\n double *) &mem_80774[(chunk_offset_76746 +\n i_76752) * 8];\n x_76757 = *(__local double *) &mem_80778[(ltid_72457 *\n D_68526 +\n chunk_offset_76746 +\n i_76752) * 8];\n res_76759 = x_76756 * x_76757;\n res_76761 = acc_76753 + res_76759;\n \n double acc_tmp_82582 = res_76761;\n \n acc_76753 = acc_tmp_82582;\n }\n } else {\n for (int32_t i_76752 = 0; i_76752 < chunk_sizze_76745;\n i_76752++) {\n double x_76756;\n double x_76757;\n double res_76759;\n double res_76761;\n \n x_76756 = *(__local\n double *) &mem_80774[(chunk_offset_76746 +\n i_76752) * 8];\n x_76757 = *(__local double *) &mem_80778[(ltid_72457 *\n D_68526 +\n ",
" chunk_offset_76746 +\n i_76752) * 8];\n res_76759 = x_76756 * x_76757;\n res_76761 = acc_76753 + res_76759;\n \n double acc_tmp_82583 = res_76761;\n \n acc_76753 = acc_tmp_82583;\n }\n }\n res_76750 = acc_76753;\n x_76747 = res_76750;\n res_76744 = x_76747;\n \n double res_76762;\n double res_76763;\n \n res_76762 = rev_sqnorm_arg_76741 * res_76744;\n res_76763 = res_76762 + res_76762;\n *(__local double *) &mem_80781[ctid_72474 * 8] = res_76763;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&\n slt32(ltid_72457, computed_group_sizze_76480)) { }\n \n __local char *mem_80821;\n __local char *mem_80825;\n __local char *mem_80828;\n \n mem_80821 = (__local char *) mem_80821_backing_3;\n mem_80825 = (__local char *) mem_80825_backing_4;\n mem_80828 = (__local char *) mem_80828_backing_5;\n for (int32_t comb_iter_82584 = 0; comb_iter_82584 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82584++) {\n int32_t ctid_72513;\n int32_t flat_comb_id_82585 = comb_iter_82584 *\n computed_group_sizze_76480 + local_tid_72546;\n \n ctid_72513 = flat_comb_id_82585;\n if (slt32(ctid_72513, D_68526) && 1) {\n double x_76768;\n double qs_elem_elem_76771;\n \n x_76768 = *(__local double *) &mem_80781[ltid_72457 * 8];\n ",
"qs_elem_elem_76771 = *(__global\n double *) &qs_mem_80369[(gtid_72455 *\n D_68514 +\n ltid_72457) * 8];\n \n double res_76772;\n double x_76775 = 0.0;\n int32_t chunk_sizze_76773;\n int32_t chunk_offset_76774 = 0;\n \n chunk_sizze_76773 = D_68526;\n \n double res_76778;\n double acc_76781 = x_76775;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_76779;\n \n groupstream_mapaccum_dummy_chunk_sizze_76779 = 1;\n if (chunk_sizze_76773 == D_68526) {\n for (int32_t i_76780 = 0; i_76780 < D_68526; i_76780++) {\n double x_76784;\n double x_76785;\n double res_76787;\n double res_76789;\n \n x_76784 = *(__local double *) &mem_80778[(ltid_72457 +\n D_68526 *\n chunk_offset_76774 +\n D_68526 *\n i_76780 + 0 *\n D_68526) * 8];\n x_76785 = *(__local\n double *) &mem_80781[(chunk_offset_76774 +\n i_76780) * 8];\n res_76787 = x_76784 * x_76785;\n res_76789 = acc_76781 + res_76787;\n \n double acc_tmp_82586 = res_76789;\n \n acc_76781 = acc_tmp_82586;\n }\n } else {\n for (int32_t i_76780 = 0; i_76780 < chunk_sizze_76773;\n ",
" i_76780++) {\n double x_76784;\n double x_76785;\n double res_76787;\n double res_76789;\n \n x_76784 = *(__local double *) &mem_80778[(ltid_72457 +\n D_68526 *\n chunk_offset_76774 +\n D_68526 *\n i_76780 + 0 *\n D_68526) * 8];\n x_76785 = *(__local\n double *) &mem_80781[(chunk_offset_76774 +\n i_76780) * 8];\n res_76787 = x_76784 * x_76785;\n res_76789 = acc_76781 + res_76787;\n \n double acc_tmp_82587 = res_76789;\n \n acc_76781 = acc_tmp_82587;\n }\n }\n res_76778 = acc_76781;\n x_76775 = res_76778;\n res_76772 = x_76775;\n \n double res_76790;\n \n res_76790 = 0.0 - res_76772;\n for (int32_t i_76797 = 0; i_76797 < D_68526; i_76797++) {\n double x_76798;\n double res_76800;\n bool cond_76801;\n bool cond_76802;\n \n x_76798 = *(__local double *) &mem_80774[i_76797 * 8];\n res_76800 = x_76768 * x_76798;\n cond_76801 = slt32(ltid_72457, i_76797);\n cond_76802 = ltid_72457 == i_76797;\n if (cond_76801) {\n for (int32_t i_82590 = 0; i_82590 < D_68526; i_82590++) {\n *(__global double *) &mem_80798[(group_id_72547 *\n (computed_",
"group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82590 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82591 = 0; i_82591 < triD_68516; i_82591++) {\n *(__global double *) &mem_80801[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82591 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82592 = 0; i_82592 < D_68526; i_82592++) {\n *(__global double *) &mem_81742[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82592 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_80798[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n ",
" local_tid_72546 +\n i_82592 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82593 = 0; i_82593 < triD_68516; i_82593++) {\n *(__global double *) &mem_81739[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82593 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_80801[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82593 *\n computed_group_sizze_76480) *\n 8];\n }\n } else {\n if (cond_76802) {\n double res_76809;\n double deltaVec_arg_76810;\n \n res_76809 = futrts_exp64(qs_elem_elem_76771);\n ",
" deltaVec_arg_76810 = res_76800 * res_76809;\n for (int32_t i_76815 = 0; i_76815 < D_68526;\n i_76815++) {\n bool cond_76817;\n double res_76818;\n \n cond_76817 = i_76815 == ltid_72457;\n if (cond_76817) {\n res_76818 = deltaVec_arg_76810;\n } else {\n res_76818 = 0.0;\n }\n *(__global double *) &mem_80804[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_76815 *\n computed_group_sizze_76480) *\n 8] = res_76818;\n }\n for (int32_t i_82595 = 0; i_82595 < triD_68516;\n i_82595++) {\n *(__global double *) &mem_80807[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82595 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82596 = 0; i_82596 < D_68526;\n i_82596++) {\n *(",
"__global double *) &mem_81732[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82596 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_80804[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82596 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82597 = 0; i_82597 < triD_68516;\n i_82597++) {\n *(__global double *) &mem_81729[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82597 *\n computed_group_sizze_76480) *\n 8] = *(__global\n ",
" double *) &mem_80807[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82597 *\n computed_group_sizze_76480) *\n 8];\n }\n } else {\n int32_t y_76821;\n int32_t x_76822;\n int32_t res_76823;\n int32_t deltaVec_arg_76824;\n \n y_76821 = ltid_72457 - 1;\n x_76822 = ltid_72457 * y_76821;\n res_76823 = sdiv32(x_76822, 2);\n deltaVec_arg_76824 = i_76797 + res_76823;\n for (int32_t i_76829 = 0; i_76829 < triD_68516;\n i_76829++) {\n bool cond_76831;\n double res_76832;\n \n cond_76831 = i_76829 == deltaVec_arg_76824;\n if (cond_76831) {\n res_76832 = res_76800;\n } else {\n res_76832 = 0.0;\n }\n *(__global double *) &mem_80810[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n ",
" local_tid_72546 +\n i_76829 *\n computed_group_sizze_76480) *\n 8] = res_76832;\n }\n for (int32_t i_82599 = 0; i_82599 < D_68526;\n i_82599++) {\n *(__global double *) &mem_80813[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82599 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82600 = 0; i_82600 < D_68526;\n i_82600++) {\n *(__global double *) &mem_81732[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82600 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_80813[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) ",
"+\n local_tid_72546 +\n i_82600 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82601 = 0; i_82601 < triD_68516;\n i_82601++) {\n *(__global double *) &mem_81729[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82601 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_80810[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82601 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n for (int32_t i_82602 = 0; i_82602 < D_68526; i_82602++) {\n ",
" *(__global double *) &mem_81742[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82602 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81732[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82602 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82603 = 0; i_82603 < triD_68516; i_82603++) {\n *(__global double *) &mem_81739[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82603 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81729[(group_id_72547 *\n ",
" (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82603 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n for (int32_t i_82604 = 0; i_82604 < D_68526; i_82604++) {\n *(__global double *) &mem_80785[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526 * D_68526) +\n local_tid_72546 + i_76797 *\n (computed_group_sizze_76480 *\n D_68526) + i_82604 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81742[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n i_82604 *\n computed_group_sizze_76480) *\n ",
" 8];\n }\n for (int32_t i_82605 = 0; i_82605 < triD_68516; i_82605++) {\n *(__global double *) &mem_80789[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516 * D_68526) +\n local_tid_72546 + i_76797 *\n (computed_group_sizze_76480 *\n triD_68516) + i_82605 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81739[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82605 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n for (int32_t i_76841 = 0; i_76841 < D_68526; i_76841++) {\n double res_76843;\n double redout_76844 = 0.0;\n \n for (int32_t i_76845 = 0; i_76845 < D_68526; i_76845++) {\n double x_76846;\n double res_76849;\n \n x_76846 = *(__global double *) &mem_80785[(group_id_72547 *\n (computed_group_sizze_76480 *\n ",
" D_68526 *\n D_68526) +\n local_tid_72546 +\n (i_76845 *\n (computed_group_sizze_76480 *\n D_68526) +\n i_76841 *\n computed_group_sizze_76480)) *\n 8];\n res_76849 = redout_76844 + x_76846;\n \n double redout_tmp_82607 = res_76849;\n \n redout_76844 = redout_tmp_82607;\n }\n res_76843 = redout_76844;\n *(__global double *) &mem_80792[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) + local_tid_72546 +\n i_76841 *\n computed_group_sizze_76480) *\n 8] = res_76843;\n }\n for (int32_t i_76855 = 0; i_76855 < triD_68516; i_76855++) {\n double res_76857;\n double redout_76858 = 0.0;\n \n for (int32_t i_76859 = 0; i_76859 < D_68526; i_76859++) {\n double x_76860;\n double res_76863;\n \n x_76860 = *(__global double *) &mem_80789[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516 *\n ",
" D_68526) +\n local_tid_72546 +\n (i_76859 *\n (computed_group_sizze_76480 *\n triD_68516) +\n i_76855 *\n computed_group_sizze_76480)) *\n 8];\n res_76863 = redout_76858 + x_76860;\n \n double redout_tmp_82609 = res_76863;\n \n redout_76858 = redout_tmp_82609;\n }\n res_76857 = redout_76858;\n *(__global double *) &mem_80795[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 + i_76855 *\n computed_group_sizze_76480) *\n 8] = res_76857;\n }\n for (int32_t i_82610 = 0; i_82610 < D_68526; i_82610++) {\n *(__local double *) &mem_80821[(ctid_72513 * D_68526 +\n i_82610) * 8] = *(__global\n double *) &mem_80792[(group_id_72547 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_72546 +\n ",
" i_82610 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82611 = 0; i_82611 < triD_68516; i_82611++) {\n *(__local double *) &mem_80825[(ctid_72513 * triD_68516 +\n i_82611) * 8] = *(__global\n double *) &mem_80795[(group_id_72547 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_72546 +\n i_82611 *\n computed_group_sizze_76480) *\n 8];\n }\n *(__local double *) &mem_80828[ctid_72513 * 8] = res_76790;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&\n slt32(ltid_72457, computed_group_sizze_76480)) { }\n \n __local char *mem_80831;\n \n mem_80831 = (__local char *) mem_80831_backing_6;\n for (int32_t comb_iter_82612 = 0; comb_iter_82612 < squot32(triD_68516 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82612++) {\n int32_t ctid_72523;\n int32_t",
" flat_comb_id_82613 = comb_iter_82612 *\n computed_group_sizze_76480 + local_tid_72546;\n \n ctid_72523 = flat_comb_id_82613;\n if (slt32(ctid_72523, triD_68516) && 1) {\n double res_76868;\n double x_76871 = 0.0;\n int32_t chunk_sizze_76869;\n int32_t chunk_offset_76870 = 0;\n \n chunk_sizze_76869 = D_68526;\n \n double res_76873;\n double acc_76876 = x_76871;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_76874;\n \n groupstream_mapaccum_dummy_chunk_sizze_76874 = 1;\n if (chunk_sizze_76869 == D_68526) {\n for (int32_t i_76875 = 0; i_76875 < D_68526; i_76875++) {\n double x_76878;\n double res_76881;\n \n x_76878 = *(__local double *) &mem_80825[(ltid_72457 +\n triD_68516 *\n chunk_offset_76870 +\n triD_68516 *\n i_76875 + 0 *\n triD_68516) * 8];\n res_76881 = acc_76876 + x_76878;\n \n double acc_tmp_82614 = res_76881;\n \n acc_76876 = acc_tmp_82614;\n }\n } else {\n for (int32_t i_76875 = 0; i_76875 < chunk_sizze_76869;\n i_76875++) {\n double x_76878;\n double res_76881;\n \n x_76878 = *(__local double *) &mem_80825[(ltid_72457 +\n triD_68516 *\n chunk_offset_76870 +\n ",
" triD_68516 *\n i_76875 + 0 *\n triD_68516) * 8];\n res_76881 = acc_76876 + x_76878;\n \n double acc_tmp_82615 = res_76881;\n \n acc_76876 = acc_tmp_82615;\n }\n }\n res_76873 = acc_76876;\n x_76871 = res_76873;\n res_76868 = x_76871;\n *(__local double *) &mem_80831[ctid_72523 * 8] = res_76868;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&\n slt32(ltid_72457, computed_group_sizze_76480)) { }\n \n __local char *mem_80834;\n \n mem_80834 = (__local char *) mem_80834_backing_7;\n for (int32_t comb_iter_82616 = 0; comb_iter_82616 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82616++) {\n int32_t ctid_72533;\n int32_t flat_comb_id_82617 = comb_iter_82616 *\n computed_group_sizze_76480 + local_tid_72546;\n \n ctid_72533 = flat_comb_id_82617;\n if (slt32(ctid_72533, D_68526) && 1) {\n double res_76885;\n double x_76888 = 0.0;\n int32_t chunk_sizze_76886;\n int32_t chunk_offset_76887 = 0;\n \n chunk_sizze_76886 = D_68526;\n \n double res_76890;\n double acc_76893 = x_76888;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_76891;\n \n groupstream_mapaccum_dummy_chunk_sizze_76891 = 1;\n if (chunk_sizze_76886 == D_68526) {\n for (int32_t i_76892 = 0; i_7689",
"2 < D_68526; i_76892++) {\n double x_76895;\n double res_76898;\n \n x_76895 = *(__local double *) &mem_80821[(ltid_72457 +\n D_68526 *\n chunk_offset_76887 +\n D_68526 *\n i_76892 + 0 *\n D_68526) * 8];\n res_76898 = acc_76893 + x_76895;\n \n double acc_tmp_82618 = res_76898;\n \n acc_76893 = acc_tmp_82618;\n }\n } else {\n for (int32_t i_76892 = 0; i_76892 < chunk_sizze_76886;\n i_76892++) {\n double x_76895;\n double res_76898;\n \n x_76895 = *(__local double *) &mem_80821[(ltid_72457 +\n D_68526 *\n chunk_offset_76887 +\n D_68526 *\n i_76892 + 0 *\n D_68526) * 8];\n res_76898 = acc_76893 + x_76895;\n \n double acc_tmp_82619 = res_76898;\n \n acc_76893 = acc_tmp_82619;\n }\n }\n res_76890 = acc_76893;\n x_76888 = res_76890;\n res_76885 = x_76888;\n \n double res_76899 = res_76707 + res_76885;\n \n *(__local double *) &mem_80834[ctid_72533 * 8] = res_76899;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n ",
" if (local_tid_72546 == 0) {\n *(__global double *) &mem_80837[group_id_72547 * 8] = res_76707;\n }\n for (int32_t i_82621 = 0; i_82621 < squot32(D_68526 - local_tid_72546 +\n computed_group_sizze_76480 - 1,\n computed_group_sizze_76480);\n i_82621++) {\n *(__global double *) &mem_80841[(group_id_72547 * D_68526 + (i_82621 *\n computed_group_sizze_76480 +\n local_tid_72546)) *\n 8] = *(__local\n double *) &mem_80828[(i_82621 *\n computed_group_sizze_76480 +\n local_tid_72546) *\n 8];\n }\n for (int32_t i_82622 = 0; i_82622 < squot32(D_68526 - local_tid_72546 +\n computed_group_sizze_76480 - 1,\n computed_group_sizze_76480);\n i_82622++) {\n *(__global double *) &mem_80845[(group_id_72547 * D_68526 + (i_82622 *\n computed_group_sizze_76480 +\n local_tid_72546)) *\n 8] = *(__local\n double *) &mem_80834[(i_82622 *\n computed_group_sizze_76480 +\n local_tid_72546) *\n 8];\n }\n for (int32_t i_82623 = 0; i_82623 < squot32(triD_6",
"8516 - local_tid_72546 +\n computed_group_sizze_76480 - 1,\n computed_group_sizze_76480);\n i_82623++) {\n *(__global double *) &mem_80849[(group_id_72547 * triD_68516 +\n (i_82623 * computed_group_sizze_76480 +\n local_tid_72546)) * 8] = *(__local\n double *) &mem_80831[(i_82623 *\n computed_group_sizze_76480 +\n local_tid_72546) *\n 8];\n }\n}\n__kernel void map_intra_group_73000(__local volatile\n int64_t *mem_81373_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t D_68526, __global\n unsigned char *mem_80863, __global\n unsigned char *mem_81370, __global\n unsigned char *mem_81376)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81373_backing_0 =\n mem_81373_backing_aligned_0;\n int32_t global_tid_73000;\n int32_t local_tid_73001;\n int32_t group_sizze_82991;\n int32_t wave_sizze_82990;\n int32_t group_id_73002;\n \n global_tid_73000 = get_global_id(0);\n local_tid_73001 = get_local_id(0);\n group_sizze_82991 = get_local_size(0);\n wave_sizze_82990 = LOCKSTEP_WIDTH;\n group_id_73002 = get_group_id(0);\n \n int32_t gtid_72991;\n int32_t gtid_72992;\n int32_t gtid_72993;\n int32_t ltid_72996;\n \n gti",
"d_72991 = squot32(global_tid_73000, K_68510 * D_68526 * D_68526);\n gtid_72992 = squot32(global_tid_73000 - squot32(global_tid_73000, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 * D_68526);\n gtid_72993 = squot32(global_tid_73000 - squot32(global_tid_73000, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526) -\n squot32(global_tid_73000 - squot32(global_tid_73000,\n K_68510 * D_68526 *\n D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526), D_68526);\n ltid_72996 = global_tid_73000 - squot32(global_tid_73000, K_68510 *\n D_68526 * D_68526) * (K_68510 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73000 - squot32(global_tid_73000, K_68510 * D_68526 *\n D_68526) * (K_68510 * D_68526 *\n D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526) - squot32(global_tid_73000 -\n squot32(global_tid_73000,\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526) -\n ",
" squot32(global_tid_73000 -\n squot32(global_tid_73000,\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526) *\n (D_68526 * D_68526),\n D_68526) * D_68526;\n \n double res_77876;\n double x_79861;\n \n if (((slt32(gtid_72991, N_68508) && slt32(gtid_72992, K_68510)) &&\n slt32(gtid_72993, D_68526)) && slt32(ltid_72996, D_68526)) {\n res_77876 = *(__global double *) &mem_80863[(gtid_72991 * K_68510 +\n gtid_72992) * 8];\n x_79861 = *(__global double *) &mem_81370[(gtid_72991 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_72992 * (D_68526 *\n D_68526) +\n gtid_72993 * D_68526 +\n ltid_72996) * 8];\n }\n \n __local char *mem_81373;\n double res_77880;\n \n mem_81373 = (__local char *) mem_81373_backing_0;\n for (int32_t comb_iter_82992 = 0; comb_iter_82992 < 1; comb_iter_82992++) {\n int32_t ctid_72998;\n ",
" int32_t flat_comb_id_82993 = comb_iter_82992 * D_68526 +\n local_tid_73001;\n \n ctid_72998 = flat_comb_id_82993;\n if (slt32(ctid_72998, D_68526) && 1) {\n *(__local double *) &mem_81373[ctid_72998 * 8] = x_79861;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82994;\n int32_t skip_waves_82995;\n double x_77881;\n double x_77882;\n \n offset_82994 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73001, D_68526)) {\n x_77881 = *(__local double *) &mem_81373[(local_tid_73001 +\n offset_82994) * 8];\n }\n }\n offset_82994 = 1;\n while (slt32(offset_82994, wave_sizze_82990)) {\n if (slt32(local_tid_73001 + offset_82994, D_68526) &&\n ((local_tid_73001 - squot32(local_tid_73001, wave_sizze_82990) *\n wave_sizze_82990) & (2 * offset_82994 - 1)) == 0) {\n // read array element\n {\n x_77882 = *(volatile __local\n double *) &mem_81373[(local_tid_73001 +\n offset_82994) * 8];\n }\n // apply reduction operation\n {\n double res_77883;\n \n if (((slt32(gtid_72991, N_68508) && slt32(gtid_72992,\n K_68510)) &&\n slt32(gtid_72993, D_68526)) && slt32(ltid_72996,\n D_68526)) {\n res_77883 = x_77881 + x_77882;\n }\n x_77881 = res_77883;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81373[local_tid_73001 * 8] =\n x_77881;\n }\n }\n offset_82994 *= 2;\n }\n skip_waves_82995 = 1;\n while (slt32",
"(skip_waves_82995, squot32(D_68526 + wave_sizze_82990 - 1,\n wave_sizze_82990))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82994 = skip_waves_82995 * wave_sizze_82990;\n if (slt32(local_tid_73001 + offset_82994, D_68526) &&\n ((local_tid_73001 - squot32(local_tid_73001, wave_sizze_82990) *\n wave_sizze_82990) == 0 && (squot32(local_tid_73001,\n wave_sizze_82990) & (2 *\n skip_waves_82995 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_77882 = *(__local double *) &mem_81373[(local_tid_73001 +\n offset_82994) * 8];\n }\n // apply reduction operation\n {\n double res_77883;\n \n if (((slt32(gtid_72991, N_68508) && slt32(gtid_72992,\n K_68510)) &&\n slt32(gtid_72993, D_68526)) && slt32(ltid_72996,\n D_68526)) {\n res_77883 = x_77881 + x_77882;\n }\n x_77881 = res_77883;\n }\n // write result of operation\n {\n *(__local double *) &mem_81373[local_tid_73001 * 8] = x_77881;\n }\n }\n skip_waves_82995 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_77880 = *(__local double *) &mem_81373[0];\n \n double res_77884;\n \n if (((slt32(gtid_72991, N_68508) && slt32(gtid_72992, K_68510)) &&\n slt32(gtid_72993, D_68526)) && slt32(ltid_72996, D_68526)) {\n res_77884 = res_77876 + res_77880;\n }\n if (local_tid_73001 == 0) {\n *(__global double *) &mem_81376[group_id_73002 * ",
"8] = res_77884;\n }\n}\n__kernel void map_intra_group_73195(__local volatile\n int64_t *mem_81315_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t triD_68516, int32_t D_68526,\n __global unsigned char *mem_81312, __global\n unsigned char *mem_81318)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81315_backing_0 =\n mem_81315_backing_aligned_0;\n int32_t global_tid_73195;\n int32_t local_tid_73196;\n int32_t group_sizze_82937;\n int32_t wave_sizze_82936;\n int32_t group_id_73197;\n \n global_tid_73195 = get_global_id(0);\n local_tid_73196 = get_local_id(0);\n group_sizze_82937 = get_local_size(0);\n wave_sizze_82936 = LOCKSTEP_WIDTH;\n group_id_73197 = get_group_id(0);\n \n int32_t gtid_73186;\n int32_t gtid_73187;\n int32_t gtid_73188;\n int32_t ltid_73191;\n \n gtid_73186 = squot32(global_tid_73195, K_68510 * triD_68516 * D_68526);\n gtid_73187 = squot32(global_tid_73195 - squot32(global_tid_73195, K_68510 *\n triD_68516 * D_68526) *\n (K_68510 * triD_68516 * D_68526), triD_68516 *\n D_68526);\n gtid_73188 = squot32(global_tid_73195 - squot32(global_tid_73195, K_68510 *\n triD_68516 * D_68526) *\n (K_68510 * triD_68516 * D_68526) -\n squot32(global_tid_73195 - squot32(global_tid_73195,\n K_68510 *\n triD_68516 *\n D_68526) *\n (K_68510 * triD_6851",
"6 * D_68526), triD_68516 *\n D_68526) * (triD_68516 * D_68526), D_68526);\n ltid_73191 = global_tid_73195 - squot32(global_tid_73195, K_68510 *\n triD_68516 * D_68526) * (K_68510 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73195 - squot32(global_tid_73195, K_68510 *\n triD_68516 * D_68526) * (K_68510 *\n triD_68516 *\n D_68526),\n triD_68516 * D_68526) * (triD_68516 * D_68526) -\n squot32(global_tid_73195 - squot32(global_tid_73195, K_68510 *\n triD_68516 * D_68526) * (K_68510 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73195 - squot32(global_tid_73195, K_68510 *\n triD_68516 * D_68526) *\n (K_68510 * triD_68516 * D_68526), triD_68516 *\n D_68526) * (triD_68516 * D_68526), D_68526) * D_68526;\n \n double x_79855;\n \n if (((slt32(gtid_73186, N_68508) && slt32(gtid_73187, K_68510)) &&\n slt32(gtid_73188, triD_68516)) && slt32(ltid_73191, D_68526)) {\n x_79855 = *(__global double *) &mem_81312[(gtid_73186 * (D_68526 *\n triD_68516 *\n K_68510) +\n gtid_73187 * (D_68526 *\n triD_68516) +\n ",
" gtid_73188 * D_68526 +\n ltid_73191) * 8];\n }\n \n __local char *mem_81315;\n double res_77783;\n \n mem_81315 = (__local char *) mem_81315_backing_0;\n for (int32_t comb_iter_82938 = 0; comb_iter_82938 < 1; comb_iter_82938++) {\n int32_t ctid_73193;\n int32_t flat_comb_id_82939 = comb_iter_82938 * D_68526 +\n local_tid_73196;\n \n ctid_73193 = flat_comb_id_82939;\n if (slt32(ctid_73193, D_68526) && 1) {\n *(__local double *) &mem_81315[ctid_73193 * 8] = x_79855;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82940;\n int32_t skip_waves_82941;\n double x_77784;\n double x_77785;\n \n offset_82940 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73196, D_68526)) {\n x_77784 = *(__local double *) &mem_81315[(local_tid_73196 +\n offset_82940) * 8];\n }\n }\n offset_82940 = 1;\n while (slt32(offset_82940, wave_sizze_82936)) {\n if (slt32(local_tid_73196 + offset_82940, D_68526) &&\n ((local_tid_73196 - squot32(local_tid_73196, wave_sizze_82936) *\n wave_sizze_82936) & (2 * offset_82940 - 1)) == 0) {\n // read array element\n {\n x_77785 = *(volatile __local\n double *) &mem_81315[(local_tid_73196 +\n offset_82940) * 8];\n }\n // apply reduction operation\n {\n double res_77786;\n \n if (((slt32(gtid_73186, N_68508) && slt32(gtid_73187,\n K_68510)) &&\n slt32(gtid_73188, triD_68516)) && slt32(ltid_73191,\n D_68526)) {\n res_77786 = x_7778",
"4 + x_77785;\n }\n x_77784 = res_77786;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81315[local_tid_73196 * 8] =\n x_77784;\n }\n }\n offset_82940 *= 2;\n }\n skip_waves_82941 = 1;\n while (slt32(skip_waves_82941, squot32(D_68526 + wave_sizze_82936 - 1,\n wave_sizze_82936))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82940 = skip_waves_82941 * wave_sizze_82936;\n if (slt32(local_tid_73196 + offset_82940, D_68526) &&\n ((local_tid_73196 - squot32(local_tid_73196, wave_sizze_82936) *\n wave_sizze_82936) == 0 && (squot32(local_tid_73196,\n wave_sizze_82936) & (2 *\n skip_waves_82941 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_77785 = *(__local double *) &mem_81315[(local_tid_73196 +\n offset_82940) * 8];\n }\n // apply reduction operation\n {\n double res_77786;\n \n if (((slt32(gtid_73186, N_68508) && slt32(gtid_73187,\n K_68510)) &&\n slt32(gtid_73188, triD_68516)) && slt32(ltid_73191,\n D_68526)) {\n res_77786 = x_77784 + x_77785;\n }\n x_77784 = res_77786;\n }\n // write result of operation\n {\n *(__local double *) &mem_81315[local_tid_73196 * 8] = x_77784;\n }\n }\n skip_waves_82941 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_7778",
"3 = *(__local double *) &mem_81315[0];\n if (local_tid_73196 == 0) {\n *(__global double *) &mem_81318[group_id_73197 * 8] = res_77783;\n }\n}\n__kernel void map_intra_group_73444(__local volatile\n int64_t *mem_81029_backing_aligned_0,\n __local volatile\n int64_t *mem_81055_backing_aligned_1,\n __local volatile\n int64_t *mem_81059_backing_aligned_2,\n __local volatile\n int64_t *mem_81062_backing_aligned_3,\n __local volatile\n int64_t *mem_81065_backing_aligned_4,\n int32_t N_68508, int32_t K_68510,\n int32_t triD_68516, int32_t D_68526,\n int32_t computed_group_sizze_76480, __global\n unsigned char *mem_80873, __global\n unsigned char *mem_80889, __global\n unsigned char *res_r_r_mem_80930, __global\n unsigned char *mem_80942, __global\n unsigned char *mem_81026, __global\n unsigned char *mem_81032, __global\n unsigned char *mem_81035, __global\n unsigned char *mem_81038, __global\n unsigned char *mem_81041, __global\n unsigned char *mem_81044, __global\n unsigned char *mem_81047, __global\n unsigned char *mem_81069, __global\n unsigned char *mem_81073, __global\n unsigned char *mem_810",
"76, __global\n unsigned char *mem_81769, __global\n unsigned char *mem_81772, __global\n unsigned char *mem_81779, __global\n unsigned char *mem_81782)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81029_backing_0 =\n mem_81029_backing_aligned_0;\n __local volatile char *restrict mem_81055_backing_1 =\n mem_81055_backing_aligned_1;\n __local volatile char *restrict mem_81059_backing_2 =\n mem_81059_backing_aligned_2;\n __local volatile char *restrict mem_81062_backing_3 =\n mem_81062_backing_aligned_3;\n __local volatile char *restrict mem_81065_backing_4 =\n mem_81065_backing_aligned_4;\n int32_t global_tid_73444;\n int32_t local_tid_73445;\n int32_t group_sizze_82724;\n int32_t wave_sizze_82723;\n int32_t group_id_73446;\n \n global_tid_73444 = get_global_id(0);\n local_tid_73445 = get_local_id(0);\n group_sizze_82724 = get_local_size(0);\n wave_sizze_82723 = LOCKSTEP_WIDTH;\n group_id_73446 = get_group_id(0);\n \n int32_t gtid_73396;\n int32_t gtid_73397;\n int32_t gtid_73398;\n int32_t ltid_73401;\n \n gtid_73396 = squot32(global_tid_73444, K_68510 * D_68526 *\n computed_group_sizze_76480);\n gtid_73397 = squot32(global_tid_73444 - squot32(global_tid_73444, K_68510 *\n D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 * computed_group_sizze_76480),\n D_68526 * computed_group_sizze_76480);\n gtid_73398 = squot32(global_tid_73444 - squot32(global_tid_73444, K_68510 *\n ",
" D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 * computed_group_sizze_76480) -\n squot32(global_tid_73444 - squot32(global_tid_73444,\n K_68510 * D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 *\n computed_group_sizze_76480), D_68526 *\n computed_group_sizze_76480) * (D_68526 *\n computed_group_sizze_76480),\n computed_group_sizze_76480);\n ltid_73401 = global_tid_73444 - squot32(global_tid_73444, K_68510 *\n D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 * computed_group_sizze_76480) -\n squot32(global_tid_73444 - squot32(global_tid_73444, K_68510 * D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 * computed_group_sizze_76480), D_68526 *\n computed_group_sizze_76480) * (D_68526 *\n computed_group_sizze_76480) -\n squot32(global_tid_73444 - squot32(global_tid_73444, K_68510 * D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 * computed_group_sizze_76480) -\n squot32(global_tid_73444 - squot32(global_tid_73444, K_68510 *\n D_68526 *\n computed_group_sizze_76480) *\n (K_68510 * D_68526 * computed_group_sizze_76480),\n D_68526 * computed_group_sizze_76480) * (D_68526 *\n ",
" computed_group_sizze_76480),\n computed_group_sizze_76480) * computed_group_sizze_76480;\n \n double x_77300;\n double qs_elem_elem_77303;\n \n if (((slt32(gtid_73396, N_68508) && slt32(gtid_73397, K_68510)) &&\n slt32(gtid_73398, D_68526)) && slt32(ltid_73401,\n computed_group_sizze_76480)) {\n x_77300 = *(__global double *) &mem_81026[(gtid_73397 * (N_68508 *\n D_68526) +\n gtid_73398 * N_68508 +\n gtid_73396) * 8];\n qs_elem_elem_77303 = *(__global double *) &mem_80873[(gtid_73397 *\n (N_68508 *\n D_68526) +\n gtid_73398 *\n N_68508 +\n gtid_73396) * 8];\n }\n \n __local char *mem_81029;\n double res_77308;\n \n mem_81029 = (__local char *) mem_81029_backing_0;\n for (int32_t comb_iter_82725 = 0; comb_iter_82725 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82725++) {\n int32_t ctid_73403;\n int32_t flat_comb_id_82726 = comb_iter_82725 *\n computed_group_sizze_76480 + local_tid_73445;\n \n ctid_73403 = flat_comb_id_82726;\n if (slt32(ctid_73403, D_68526) && 1) {\n double x_77305;\n double x_77306;\n double res_77307;\n \n ",
" x_77305 = *(__global double *) &mem_80942[(gtid_73398 * (K_68510 *\n N_68508 *\n D_68526) +\n ltid_73401 * (K_68510 *\n N_68508) +\n gtid_73396 * K_68510 +\n gtid_73397) * 8];\n x_77306 = *(__global double *) &res_r_r_mem_80930[(gtid_73396 *\n (D_68526 *\n K_68510) +\n gtid_73397 *\n D_68526 +\n ltid_73401) * 8];\n res_77307 = x_77305 * x_77306;\n *(__local double *) &mem_81029[ctid_73403 * 8] = res_77307;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82727;\n int32_t skip_waves_82728;\n double x_77309;\n double x_77310;\n \n offset_82727 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73445, D_68526)) {\n x_77309 = *(__local double *) &mem_81029[(local_tid_73445 +\n offset_82727) * 8];\n }\n }\n offset_82727 = 1;\n while (slt32(offset_82727, wave_sizze_82723)) {\n if (slt32(local_tid_73445 + offset_82727, D_68526) &&\n ((local_tid_73445 - squot32(local_tid_73445, wave_sizze_82723) *\n wave_sizze_82723) & (2 * offset_82727 - 1)) == 0) {\n // read array element\n {\n x_77310 = *(volatile __local\n double *) &mem_81029[(local_tid_73445 +\n ",
" offset_82727) * 8];\n }\n // apply reduction operation\n {\n double res_77311;\n \n if (((slt32(gtid_73396, N_68508) && slt32(gtid_73397,\n K_68510)) &&\n slt32(gtid_73398, D_68526)) && slt32(ltid_73401,\n computed_group_sizze_76480)) {\n res_77311 = x_77309 + x_77310;\n }\n x_77309 = res_77311;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81029[local_tid_73445 * 8] =\n x_77309;\n }\n }\n offset_82727 *= 2;\n }\n skip_waves_82728 = 1;\n while (slt32(skip_waves_82728, squot32(computed_group_sizze_76480 +\n wave_sizze_82723 - 1,\n wave_sizze_82723))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82727 = skip_waves_82728 * wave_sizze_82723;\n if (slt32(local_tid_73445 + offset_82727, D_68526) &&\n ((local_tid_73445 - squot32(local_tid_73445, wave_sizze_82723) *\n wave_sizze_82723) == 0 && (squot32(local_tid_73445,\n wave_sizze_82723) & (2 *\n skip_waves_82728 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_77310 = *(__local double *) &mem_81029[(local_tid_73445 +\n offset_82727) * 8];\n }\n // apply reduction operation\n {\n double res_77311;\n \n if (((slt32(gtid_73396, N_68508) && slt32(gtid",
"_73397,\n K_68510)) &&\n slt32(gtid_73398, D_68526)) && slt32(ltid_73401,\n computed_group_sizze_76480)) {\n res_77311 = x_77309 + x_77310;\n }\n x_77309 = res_77311;\n }\n // write result of operation\n {\n *(__local double *) &mem_81029[local_tid_73445 * 8] = x_77309;\n }\n }\n skip_waves_82728 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_77308 = *(__local double *) &mem_81029[0];\n \n double res_77312;\n bool cond_77318;\n bool cond_77319;\n \n if (((slt32(gtid_73396, N_68508) && slt32(gtid_73397, K_68510)) &&\n slt32(gtid_73398, D_68526)) && slt32(ltid_73401,\n computed_group_sizze_76480)) {\n res_77312 = 0.0 - res_77308;\n cond_77318 = slt32(gtid_73398, ltid_73401);\n cond_77319 = gtid_73398 == ltid_73401;\n }\n \n __local char *mem_81055;\n __local char *mem_81059;\n \n mem_81055 = (__local char *) mem_81055_backing_1;\n mem_81059 = (__local char *) mem_81059_backing_2;\n for (int32_t comb_iter_82729 = 0; comb_iter_82729 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82729++) {\n int32_t ctid_73412;\n int32_t flat_comb_id_82730 = comb_iter_82729 *\n computed_group_sizze_76480 + local_tid_73445;\n \n ctid_73412 = flat_comb_id_82730;\n if (slt32(ctid_73412, D_68526) && 1) {\n double x_77315;\n double res_77317;\n \n x_77315 = *(__global double *) &mem_80889[(gtid_73396 * (D_68526 *\n ",
" K_68510) +\n gtid_73397 * D_68526 +\n ltid_73401) * 8];\n res_77317 = x_77300 * x_77315;\n if (cond_77318) {\n for (int32_t i_82731 = 0; i_82731 < D_68526; i_82731++) {\n *(__global double *) &mem_81032[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 + i_82731 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82732 = 0; i_82732 < triD_68516; i_82732++) {\n *(__global double *) &mem_81035[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 + i_82732 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82733 = 0; i_82733 < D_68526; i_82733++) {\n *(__global double *) &mem_81782[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 + i_82733 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81032[(group_id_73446 *\n ",
" (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_82733 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82734 = 0; i_82734 < triD_68516; i_82734++) {\n *(__global double *) &mem_81779[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 + i_82734 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81035[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82734 *\n computed_group_sizze_76480) *\n 8];\n }\n } else {\n if (cond_77319) {\n double res_77326;\n double deltaVec_arg_77327;\n \n ",
" res_77326 = futrts_exp64(qs_elem_elem_77303);\n deltaVec_arg_77327 = res_77317 * res_77326;\n for (int32_t i_77332 = 0; i_77332 < D_68526; i_77332++) {\n bool cond_77334;\n double res_77335;\n \n cond_77334 = i_77332 == gtid_73398;\n if (cond_77334) {\n res_77335 = deltaVec_arg_77327;\n } else {\n res_77335 = 0.0;\n }\n *(__global double *) &mem_81038[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_77332 *\n computed_group_sizze_76480) *\n 8] = res_77335;\n }\n for (int32_t i_82736 = 0; i_82736 < triD_68516; i_82736++) {\n *(__global double *) &mem_81041[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82736 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82737 = 0; i_82737 < D_68526; i_82737++) {\n *(__global double *) &mem_81772[(group_id_73446 *\n (computed_group_sizze_76480 *\n",
" D_68526) +\n local_tid_73445 +\n i_82737 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81038[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_82737 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82738 = 0; i_82738 < triD_68516; i_82738++) {\n *(__global double *) &mem_81769[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82738 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81041[(group_id_73446 *\n (computed_group_sizze_76480 *\n ",
" triD_68516) +\n local_tid_73445 +\n i_82738 *\n computed_group_sizze_76480) *\n 8];\n }\n } else {\n int32_t y_77338;\n int32_t x_77339;\n int32_t res_77340;\n int32_t deltaVec_arg_77341;\n \n y_77338 = gtid_73398 - 1;\n x_77339 = gtid_73398 * y_77338;\n res_77340 = sdiv32(x_77339, 2);\n deltaVec_arg_77341 = ltid_73401 + res_77340;\n for (int32_t i_77346 = 0; i_77346 < triD_68516; i_77346++) {\n bool cond_77348;\n double res_77349;\n \n cond_77348 = i_77346 == deltaVec_arg_77341;\n if (cond_77348) {\n res_77349 = res_77317;\n } else {\n res_77349 = 0.0;\n }\n *(__global double *) &mem_81044[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_77346 *\n computed_group_sizze_76480) *\n 8] = res_77349;\n }\n for (int32_t i_82740 = 0; i_82740 < D_68526; i_82740++) {\n *(__",
"global double *) &mem_81047[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_82740 *\n computed_group_sizze_76480) *\n 8] = 0.0;\n }\n for (int32_t i_82741 = 0; i_82741 < D_68526; i_82741++) {\n *(__global double *) &mem_81772[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_82741 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81047[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_82741 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82742 = 0; i_82742 < triD_68516; i_82742++) {\n *(__global double *) &mem_8176",
"9[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82742 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81044[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82742 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n for (int32_t i_82743 = 0; i_82743 < D_68526; i_82743++) {\n *(__global double *) &mem_81782[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 + i_82743 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81772[(group_id_73446 *\n (computed_group_sizze_76480 *\n ",
" D_68526) +\n local_tid_73445 +\n i_82743 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82744 = 0; i_82744 < triD_68516; i_82744++) {\n *(__global double *) &mem_81779[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 + i_82744 *\n computed_group_sizze_76480) *\n 8] = *(__global\n double *) &mem_81769[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82744 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n for (int32_t i_82745 = 0; i_82745 < D_68526; i_82745++) {\n *(__local double *) &mem_81055[(ctid_73412 * D_68526 +\n i_82745) * 8] = *(__global\n ",
" double *) &mem_81782[(group_id_73446 *\n (computed_group_sizze_76480 *\n D_68526) +\n local_tid_73445 +\n i_82745 *\n computed_group_sizze_76480) *\n 8];\n }\n for (int32_t i_82746 = 0; i_82746 < triD_68516; i_82746++) {\n *(__local double *) &mem_81059[(ctid_73412 * triD_68516 +\n i_82746) * 8] = *(__global\n double *) &mem_81779[(group_id_73446 *\n (computed_group_sizze_76480 *\n triD_68516) +\n local_tid_73445 +\n i_82746 *\n computed_group_sizze_76480) *\n 8];\n }\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (((slt32(gtid_73396, N_68508) && slt32(gtid_73397, K_68510)) &&\n slt32(gtid_73398, D_68526)) && slt32(ltid_73401,\n computed_group_sizze_76480)) { }\n \n __local char *mem_81062;\n \n mem_81062 = (__local char *) mem_81062_",
"backing_3;\n for (int32_t comb_iter_82747 = 0; comb_iter_82747 < squot32(D_68526 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82747++) {\n int32_t ctid_73422;\n int32_t flat_comb_id_82748 = comb_iter_82747 *\n computed_group_sizze_76480 + local_tid_73445;\n \n ctid_73422 = flat_comb_id_82748;\n if (slt32(ctid_73422, D_68526) && 1) {\n double res_77355;\n double x_77358 = 0.0;\n int32_t chunk_sizze_77356;\n int32_t chunk_offset_77357 = 0;\n \n chunk_sizze_77356 = D_68526;\n \n double res_77360;\n double acc_77363 = x_77358;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_77361;\n \n groupstream_mapaccum_dummy_chunk_sizze_77361 = 1;\n if (chunk_sizze_77356 == D_68526) {\n for (int32_t i_77362 = 0; i_77362 < D_68526; i_77362++) {\n double x_77365;\n double res_77368;\n \n x_77365 = *(__local double *) &mem_81055[(ltid_73401 +\n D_68526 *\n chunk_offset_77357 +\n D_68526 *\n i_77362 + 0 *\n D_68526) * 8];\n res_77368 = acc_77363 + x_77365;\n \n double acc_tmp_82749 = res_77368;\n \n acc_77363 = acc_tmp_82749;\n }\n } else {\n for (int32_t i_77362 = 0; i_77362 < chunk_",
"sizze_77356;\n i_77362++) {\n double x_77365;\n double res_77368;\n \n x_77365 = *(__local double *) &mem_81055[(ltid_73401 +\n D_68526 *\n chunk_offset_77357 +\n D_68526 *\n i_77362 + 0 *\n D_68526) * 8];\n res_77368 = acc_77363 + x_77365;\n \n double acc_tmp_82750 = res_77368;\n \n acc_77363 = acc_tmp_82750;\n }\n }\n res_77360 = acc_77363;\n x_77358 = res_77360;\n res_77355 = x_77358;\n *(__local double *) &mem_81062[ctid_73422 * 8] = res_77355;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (((slt32(gtid_73396, N_68508) && slt32(gtid_73397, K_68510)) &&\n slt32(gtid_73398, D_68526)) && slt32(ltid_73401,\n computed_group_sizze_76480)) { }\n \n __local char *mem_81065;\n \n mem_81065 = (__local char *) mem_81065_backing_4;\n for (int32_t comb_iter_82751 = 0; comb_iter_82751 < squot32(triD_68516 +\n computed_group_sizze_76480 -\n 1,\n computed_group_sizze_76480);\n comb_iter_82751++) {\n int32_t ctid_73432;\n int32_t flat_comb_id_82752 = comb_iter_82751 *\n computed_group_sizze_76480 + local_tid_73445;\n \n ctid_73432 = flat_comb_id_82752;\n if (slt32(ctid_73432, triD_68516) && 1) {\n double res_77372;\n doub",
"le x_77375 = 0.0;\n int32_t chunk_sizze_77373;\n int32_t chunk_offset_77374 = 0;\n \n chunk_sizze_77373 = D_68526;\n \n double res_77377;\n double acc_77380 = x_77375;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_77378;\n \n groupstream_mapaccum_dummy_chunk_sizze_77378 = 1;\n if (chunk_sizze_77373 == D_68526) {\n for (int32_t i_77379 = 0; i_77379 < D_68526; i_77379++) {\n double x_77382;\n double res_77385;\n \n x_77382 = *(__local double *) &mem_81059[(ltid_73401 +\n triD_68516 *\n chunk_offset_77374 +\n triD_68516 *\n i_77379 + 0 *\n triD_68516) * 8];\n res_77385 = acc_77380 + x_77382;\n \n double acc_tmp_82753 = res_77385;\n \n acc_77380 = acc_tmp_82753;\n }\n } else {\n for (int32_t i_77379 = 0; i_77379 < chunk_sizze_77373;\n i_77379++) {\n double x_77382;\n double res_77385;\n \n x_77382 = *(__local double *) &mem_81059[(ltid_73401 +\n triD_68516 *\n chunk_offset_77374 +\n triD_68516 *\n i_77379 + 0 *\n triD_68516) * 8];\n res_77385 = acc_77380",
" + x_77382;\n \n double acc_tmp_82754 = res_77385;\n \n acc_77380 = acc_tmp_82754;\n }\n }\n res_77377 = acc_77380;\n x_77375 = res_77377;\n res_77372 = x_77375;\n *(__local double *) &mem_81065[ctid_73432 * 8] = res_77372;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n for (int32_t i_82755 = 0; i_82755 < squot32(triD_68516 - local_tid_73445 +\n computed_group_sizze_76480 - 1,\n computed_group_sizze_76480);\n i_82755++) {\n *(__global double *) &mem_81069[(group_id_73446 * triD_68516 +\n (i_82755 * computed_group_sizze_76480 +\n local_tid_73445)) * 8] = *(__local\n double *) &mem_81065[(i_82755 *\n computed_group_sizze_76480 +\n local_tid_73445) *\n 8];\n }\n for (int32_t i_82756 = 0; i_82756 < squot32(D_68526 - local_tid_73445 +\n computed_group_sizze_76480 - 1,\n computed_group_sizze_76480);\n i_82756++) {\n *(__global double *) &mem_81073[(group_id_73446 * D_68526 + (i_82756 *\n computed_group_sizze_76480 +\n local_tid_73445)) *\n 8] = *(__local\n double *) &mem_81062[(i_82756 *\n ",
" computed_group_sizze_76480 +\n local_tid_73445) *\n 8];\n }\n if (local_tid_73445 == 0) {\n *(__global double *) &mem_81076[group_id_73446 * 8] = res_77312;\n }\n}\n__kernel void map_intra_group_73690(__local volatile\n int64_t *mem_81253_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t triD_68516, int32_t D_68526,\n __global unsigned char *mem_81250, __global\n unsigned char *mem_81256)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81253_backing_0 =\n mem_81253_backing_aligned_0;\n int32_t global_tid_73690;\n int32_t local_tid_73691;\n int32_t group_sizze_82883;\n int32_t wave_sizze_82882;\n int32_t group_id_73692;\n \n global_tid_73690 = get_global_id(0);\n local_tid_73691 = get_local_id(0);\n group_sizze_82883 = get_local_size(0);\n wave_sizze_82882 = LOCKSTEP_WIDTH;\n group_id_73692 = get_group_id(0);\n \n int32_t gtid_73679;\n int32_t gtid_73680;\n int32_t gtid_73681;\n int32_t gtid_73682;\n int32_t ltid_73686;\n \n gtid_73679 = squot32(global_tid_73690, K_68510 * D_68526 * triD_68516 *\n D_68526);\n gtid_73680 = squot32(global_tid_73690 - squot32(global_tid_73690, K_68510 *\n D_68526 * triD_68516 *\n D_68526) * (K_68510 *\n D_68526 *\n triD_68516 *\n ",
" D_68526),\n D_68526 * triD_68516 * D_68526);\n gtid_73681 = squot32(global_tid_73690 - squot32(global_tid_73690, K_68510 *\n D_68526 * triD_68516 *\n D_68526) * (K_68510 *\n D_68526 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73690 - squot32(global_tid_73690,\n K_68510 * D_68526 *\n triD_68516 *\n D_68526) *\n (K_68510 * D_68526 * triD_68516 * D_68526),\n D_68526 * triD_68516 * D_68526) * (D_68526 *\n triD_68516 *\n D_68526),\n triD_68516 * D_68526);\n gtid_73682 = squot32(global_tid_73690 - squot32(global_tid_73690, K_68510 *\n D_68526 * triD_68516 *\n D_68526) * (K_68510 *\n D_68526 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73690 - squot32(global_tid_73690,\n K_68510 * D_68526 *\n triD_68516 *\n D_68526) *\n (K_68510",
" * D_68526 * triD_68516 * D_68526),\n D_68526 * triD_68516 * D_68526) * (D_68526 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73690 - squot32(global_tid_73690,\n K_68510 * D_68526 *\n triD_68516 *\n D_68526) *\n (K_68510 * D_68526 * triD_68516 * D_68526) -\n squot32(global_tid_73690 -\n squot32(global_tid_73690, K_68510 *\n D_68526 * triD_68516 *\n D_68526) * (K_68510 * D_68526 *\n triD_68516 *\n D_68526), D_68526 *\n triD_68516 * D_68526) * (D_68526 *\n triD_68516 *\n D_68526),\n triD_68516 * D_68526) * (triD_68516 * D_68526),\n D_68526);\n ltid_73686 = global_tid_73690 - squot32(global_tid_73690, K_68510 *\n D_68526 * triD_68516 * D_68526) *\n (K_68510 * D_68526 * triD_68516 * D_68526) - squot32(global_tid_73690 -\n squot32(global_tid_73690,\n K_68510 *\n D_68526 *\n triD_68516",
" *\n D_68526) *\n (K_68510 *\n D_68526 *\n triD_68516 *\n D_68526),\n D_68526 *\n triD_68516 *\n D_68526) *\n (D_68526 * triD_68516 * D_68526) - squot32(global_tid_73690 -\n squot32(global_tid_73690,\n K_68510 * D_68526 *\n triD_68516 *\n D_68526) * (K_68510 *\n D_68526 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73690 -\n squot32(global_tid_73690,\n K_68510 *\n D_68526 *\n triD_68516 *\n D_68526) *\n (K_68510 * D_68526 *\n triD_68516 *\n D_68526), D_68526 *\n triD_68516 *\n ",
" D_68526) * (D_68526 *\n triD_68516 *\n D_68526),\n triD_68516 * D_68526) *\n (triD_68516 * D_68526) - squot32(global_tid_73690 -\n squot32(global_tid_73690, K_68510 *\n D_68526 * triD_68516 *\n D_68526) * (K_68510 * D_68526 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73690 -\n squot32(global_tid_73690,\n K_68510 * D_68526 *\n triD_68516 * D_68526) *\n (K_68510 * D_68526 *\n triD_68516 * D_68526),\n D_68526 * triD_68516 *\n D_68526) * (D_68526 *\n triD_68516 *\n D_68526) -\n squot32(global_tid_73690 -\n squot32(global_tid_73690,\n K_68510 * D_68526 *\n triD_68516 * D_68526) *\n (K_68510 * D_68526 *\n triD_68516 * D_68526) -\n squot32(global_tid_",
"73690 -\n squot32(global_tid_73690,\n K_68510 *\n D_68526 *\n triD_68516 *\n D_68526) *\n (K_68510 * D_68526 *\n triD_68516 * D_68526),\n D_68526 * triD_68516 *\n D_68526) * (D_68526 *\n triD_68516 *\n D_68526),\n triD_68516 * D_68526) *\n (triD_68516 * D_68526), D_68526) *\n D_68526;\n \n double x_79849;\n \n if ((((slt32(gtid_73679, N_68508) && slt32(gtid_73680, K_68510)) &&\n slt32(gtid_73681, D_68526)) && slt32(gtid_73682, triD_68516)) &&\n slt32(ltid_73686, D_68526)) {\n x_79849 = *(__global double *) &mem_81250[(gtid_73679 * (D_68526 *\n triD_68516 *\n D_68526 *\n K_68510) +\n gtid_73680 * (D_68526 *\n triD_68516 *\n D_68526) +\n gtid_73681 * (D_68526 *\n triD_68516) +\n gtid_73",
"682 * D_68526 +\n ltid_73686) * 8];\n }\n \n __local char *mem_81253;\n double res_77684;\n \n mem_81253 = (__local char *) mem_81253_backing_0;\n for (int32_t comb_iter_82884 = 0; comb_iter_82884 < 1; comb_iter_82884++) {\n int32_t ctid_73688;\n int32_t flat_comb_id_82885 = comb_iter_82884 * D_68526 +\n local_tid_73691;\n \n ctid_73688 = flat_comb_id_82885;\n if (slt32(ctid_73688, D_68526) && 1) {\n *(__local double *) &mem_81253[ctid_73688 * 8] = x_79849;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82886;\n int32_t skip_waves_82887;\n double x_77685;\n double x_77686;\n \n offset_82886 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73691, D_68526)) {\n x_77685 = *(__local double *) &mem_81253[(local_tid_73691 +\n offset_82886) * 8];\n }\n }\n offset_82886 = 1;\n while (slt32(offset_82886, wave_sizze_82882)) {\n if (slt32(local_tid_73691 + offset_82886, D_68526) &&\n ((local_tid_73691 - squot32(local_tid_73691, wave_sizze_82882) *\n wave_sizze_82882) & (2 * offset_82886 - 1)) == 0) {\n // read array element\n {\n x_77686 = *(volatile __local\n double *) &mem_81253[(local_tid_73691 +\n offset_82886) * 8];\n }\n // apply reduction operation\n {\n double res_77687;\n \n if ((((slt32(gtid_73679, N_68508) && slt32(gtid_73680,\n K_68510)) &&\n slt32(gtid_73681, D_68526)) && slt32(gtid_73682,\n triD_68516)) &&\n slt32(ltid_73686, D_68526)) {\n ",
" res_77687 = x_77685 + x_77686;\n }\n x_77685 = res_77687;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81253[local_tid_73691 * 8] =\n x_77685;\n }\n }\n offset_82886 *= 2;\n }\n skip_waves_82887 = 1;\n while (slt32(skip_waves_82887, squot32(D_68526 + wave_sizze_82882 - 1,\n wave_sizze_82882))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82886 = skip_waves_82887 * wave_sizze_82882;\n if (slt32(local_tid_73691 + offset_82886, D_68526) &&\n ((local_tid_73691 - squot32(local_tid_73691, wave_sizze_82882) *\n wave_sizze_82882) == 0 && (squot32(local_tid_73691,\n wave_sizze_82882) & (2 *\n skip_waves_82887 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_77686 = *(__local double *) &mem_81253[(local_tid_73691 +\n offset_82886) * 8];\n }\n // apply reduction operation\n {\n double res_77687;\n \n if ((((slt32(gtid_73679, N_68508) && slt32(gtid_73680,\n K_68510)) &&\n slt32(gtid_73681, D_68526)) && slt32(gtid_73682,\n triD_68516)) &&\n slt32(ltid_73686, D_68526)) {\n res_77687 = x_77685 + x_77686;\n }\n x_77685 = res_77687;\n }\n // write result of operation\n {\n *(__local double *) &mem_81253[local_tid_73691 * 8] = x_77685;\n }\n ",
"}\n skip_waves_82887 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_77684 = *(__local double *) &mem_81253[0];\n if (local_tid_73691 == 0) {\n *(__global double *) &mem_81256[group_id_73692 * 8] = res_77684;\n }\n}\n__kernel void map_intra_group_73863(__local volatile\n int64_t *mem_81184_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t D_68526, __global\n unsigned char *mem_81181, __global\n unsigned char *mem_81187)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81184_backing_0 =\n mem_81184_backing_aligned_0;\n int32_t global_tid_73863;\n int32_t local_tid_73864;\n int32_t group_sizze_82829;\n int32_t wave_sizze_82828;\n int32_t group_id_73865;\n \n global_tid_73863 = get_global_id(0);\n local_tid_73864 = get_local_id(0);\n group_sizze_82829 = get_local_size(0);\n wave_sizze_82828 = LOCKSTEP_WIDTH;\n group_id_73865 = get_group_id(0);\n \n int32_t gtid_73852;\n int32_t gtid_73853;\n int32_t gtid_73854;\n int32_t gtid_73855;\n int32_t ltid_73859;\n \n gtid_73852 = squot32(global_tid_73863, K_68510 * D_68526 * D_68526 *\n D_68526);\n gtid_73853 = squot32(global_tid_73863 - squot32(global_tid_73863, K_68510 *\n D_68526 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526 *\n D_68526),\n D_68526 * D_68526 * D_68526);\n gtid_73854 = squot32(global_tid_73863 - squot32(global_tid_73863, K_",
"68510 *\n D_68526 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73863 - squot32(global_tid_73863,\n K_68510 * D_68526 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526 * D_68526),\n D_68526 * D_68526 * D_68526) * (D_68526 *\n D_68526 *\n D_68526),\n D_68526 * D_68526);\n gtid_73855 = squot32(global_tid_73863 - squot32(global_tid_73863, K_68510 *\n D_68526 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73863 - squot32(global_tid_73863,\n K_68510 * D_68526 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526 * D_68526),\n D_68526 * D_68526 * D_68526) * (D_68526 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73863 - ",
"squot32(global_tid_73863,\n K_68510 * D_68526 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526 * D_68526) -\n squot32(global_tid_73863 -\n squot32(global_tid_73863, K_68510 *\n D_68526 * D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526 *\n D_68526), D_68526 * D_68526 *\n D_68526) * (D_68526 * D_68526 *\n D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526), D_68526);\n ltid_73859 = global_tid_73863 - squot32(global_tid_73863, K_68510 *\n D_68526 * D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526 * D_68526) - squot32(global_tid_73863 -\n squot32(global_tid_73863,\n K_68510 *\n D_68526 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526 * D_68526),\n D_68526 * D_68526 *\n D_68526) * (D_68526 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73863 - squot32(global_tid_73863, K_685",
"10 * D_68526 *\n D_68526 * D_68526) * (K_68510 *\n D_68526 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73863 - squot32(global_tid_73863, K_68510 *\n D_68526 * D_68526 *\n D_68526) * (K_68510 *\n D_68526 *\n D_68526 *\n D_68526),\n D_68526 * D_68526 * D_68526) * (D_68526 * D_68526 *\n D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526) - squot32(global_tid_73863 -\n squot32(global_tid_73863,\n K_68510 *\n D_68526 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526 * D_68526) -\n squot32(global_tid_73863 -\n squot32(global_tid_73863,\n K_68510 *\n D_68526 *\n D_68526 *\n ",
" D_68526) *\n (K_68510 *\n D_68526 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526 *\n D_68526) *\n (D_68526 * D_68526 *\n D_68526) -\n squot32(global_tid_73863 -\n squot32(global_tid_73863,\n K_68510 *\n D_68526 *\n D_68526 *\n D_68526) *\n (K_68510 *\n D_68526 *\n D_68526 *\n D_68526) -\n squot32(global_tid_73863 -\n squot32(global_tid_73863,\n K_68510 *\n D_68526 *\n D_68526 *\n ",
" D_68526) *\n (K_68510 *\n D_68526 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526 *\n D_68526) *\n (D_68526 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526) *\n (D_68526 * D_68526),\n D_68526) * D_68526;\n \n double x_79843;\n \n if ((((slt32(gtid_73852, N_68508) && slt32(gtid_73853, K_68510)) &&\n slt32(gtid_73854, D_68526)) && slt32(gtid_73855, D_68526)) &&\n slt32(ltid_73859, D_68526)) {\n x_79843 = *(__global double *) &mem_81181[(gtid_73852 * (D_68526 *\n D_68526 *\n D_68526 *\n K_68510) +\n gtid_73853 * (D_68526 *\n D_68526 *\n D_68526) +\n gtid_7385",
"4 * (D_68526 *\n D_68526) +\n gtid_73855 * D_68526 +\n ltid_73859) * 8];\n }\n \n __local char *mem_81184;\n double res_77585;\n \n mem_81184 = (__local char *) mem_81184_backing_0;\n for (int32_t comb_iter_82830 = 0; comb_iter_82830 < 1; comb_iter_82830++) {\n int32_t ctid_73861;\n int32_t flat_comb_id_82831 = comb_iter_82830 * D_68526 +\n local_tid_73864;\n \n ctid_73861 = flat_comb_id_82831;\n if (slt32(ctid_73861, D_68526) && 1) {\n *(__local double *) &mem_81184[ctid_73861 * 8] = x_79843;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82832;\n int32_t skip_waves_82833;\n double x_77586;\n double x_77587;\n \n offset_82832 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73864, D_68526)) {\n x_77586 = *(__local double *) &mem_81184[(local_tid_73864 +\n offset_82832) * 8];\n }\n }\n offset_82832 = 1;\n while (slt32(offset_82832, wave_sizze_82828)) {\n if (slt32(local_tid_73864 + offset_82832, D_68526) &&\n ((local_tid_73864 - squot32(local_tid_73864, wave_sizze_82828) *\n wave_sizze_82828) & (2 * offset_82832 - 1)) == 0) {\n // read array element\n {\n x_77587 = *(volatile __local\n double *) &mem_81184[(local_tid_73864 +\n offset_82832) * 8];\n }\n // apply reduction operation\n {\n double res_77588;\n \n if ((((slt32(gtid_73852, N_68508) && slt32(gtid_73853,\n K_68510)) &&\n slt32(gtid_73854, D_68526)) &&",
" slt32(gtid_73855,\n D_68526)) &&\n slt32(ltid_73859, D_68526)) {\n res_77588 = x_77586 + x_77587;\n }\n x_77586 = res_77588;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81184[local_tid_73864 * 8] =\n x_77586;\n }\n }\n offset_82832 *= 2;\n }\n skip_waves_82833 = 1;\n while (slt32(skip_waves_82833, squot32(D_68526 + wave_sizze_82828 - 1,\n wave_sizze_82828))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82832 = skip_waves_82833 * wave_sizze_82828;\n if (slt32(local_tid_73864 + offset_82832, D_68526) &&\n ((local_tid_73864 - squot32(local_tid_73864, wave_sizze_82828) *\n wave_sizze_82828) == 0 && (squot32(local_tid_73864,\n wave_sizze_82828) & (2 *\n skip_waves_82833 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_77587 = *(__local double *) &mem_81184[(local_tid_73864 +\n offset_82832) * 8];\n }\n // apply reduction operation\n {\n double res_77588;\n \n if ((((slt32(gtid_73852, N_68508) && slt32(gtid_73853,\n K_68510)) &&\n slt32(gtid_73854, D_68526)) && slt32(gtid_73855,\n D_68526)) &&\n slt32(ltid_73859, D_68526)) {\n res_77588 = x_77586 + x_77587;\n }\n x_77586 = res_77588;\n }\n /",
"/ write result of operation\n {\n *(__local double *) &mem_81184[local_tid_73864 * 8] = x_77586;\n }\n }\n skip_waves_82833 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_77585 = *(__local double *) &mem_81184[0];\n if (local_tid_73864 == 0) {\n *(__global double *) &mem_81187[group_id_73865 * 8] = res_77585;\n }\n}\n__kernel void map_intra_group_74690(__local volatile\n int64_t *mem_80911_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t D_68526, __global\n unsigned char *mem_80879, __global\n unsigned char *mem_80889, __global\n unsigned char *mem_80893, __global\n unsigned char *mem_80914)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_80911_backing_0 =\n mem_80911_backing_aligned_0;\n int32_t global_tid_74690;\n int32_t local_tid_74691;\n int32_t group_sizze_82641;\n int32_t wave_sizze_82640;\n int32_t group_id_74692;\n \n global_tid_74690 = get_global_id(0);\n local_tid_74691 = get_local_id(0);\n group_sizze_82641 = get_local_size(0);\n wave_sizze_82640 = LOCKSTEP_WIDTH;\n group_id_74692 = get_group_id(0);\n \n int32_t gtid_74681;\n int32_t gtid_74682;\n int32_t gtid_74683;\n int32_t ltid_74686;\n \n gtid_74681 = squot32(global_tid_74690, K_68510 * D_68526 * D_68526);\n gtid_74682 = squot32(global_tid_74690 - squot32(global_tid_74690, K_68510 *\n D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 * D_68526);\n gtid_74683 = squot32(global_tid_74690 - squot32(global_tid_74690, K_68510 *\n ",
" D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526) -\n squot32(global_tid_74690 - squot32(global_tid_74690,\n K_68510 * D_68526 *\n D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526), D_68526);\n ltid_74686 = global_tid_74690 - squot32(global_tid_74690, K_68510 *\n D_68526 * D_68526) * (K_68510 *\n D_68526 *\n D_68526) -\n squot32(global_tid_74690 - squot32(global_tid_74690, K_68510 * D_68526 *\n D_68526) * (K_68510 * D_68526 *\n D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526) - squot32(global_tid_74690 -\n squot32(global_tid_74690,\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526) -\n squot32(global_tid_74690 -\n squot32(global_tid_74690,\n K_68510 *\n D_68526 *\n D_68526) *\n ",
" (K_68510 *\n D_68526 *\n D_68526),\n D_68526 *\n D_68526) *\n (D_68526 * D_68526),\n D_68526) * D_68526;\n \n double rev_sqnorm_arg_77063;\n double x_79805;\n double x_79807;\n double res_77068;\n \n if (((slt32(gtid_74681, N_68508) && slt32(gtid_74682, K_68510)) &&\n slt32(gtid_74683, D_68526)) && slt32(ltid_74686, D_68526)) {\n rev_sqnorm_arg_77063 = *(__global double *) &mem_80893[(gtid_74681 *\n K_68510 +\n gtid_74682) *\n 8];\n x_79805 = *(__global double *) &mem_80889[(gtid_74681 * (D_68526 *\n K_68510) +\n gtid_74682 * D_68526 +\n ltid_74686) * 8];\n x_79807 = *(__global double *) &mem_80879[(gtid_74681 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_74682 * (D_68526 *\n D_68526) +\n gtid_74683 * D_68526 +\n ltid_74686) * 8];\n res_77068 = x_79805 * x_79807;\n }\n \n __local char *mem_80911;\n double res_77069;\n \n mem_80911 = (__local char *) mem_80911_back",
"ing_0;\n for (int32_t comb_iter_82642 = 0; comb_iter_82642 < 1; comb_iter_82642++) {\n int32_t ctid_74688;\n int32_t flat_comb_id_82643 = comb_iter_82642 * D_68526 +\n local_tid_74691;\n \n ctid_74688 = flat_comb_id_82643;\n if (slt32(ctid_74688, D_68526) && 1) {\n *(__local double *) &mem_80911[ctid_74688 * 8] = res_77068;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82644;\n int32_t skip_waves_82645;\n double x_77070;\n double x_77071;\n \n offset_82644 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_74691, D_68526)) {\n x_77070 = *(__local double *) &mem_80911[(local_tid_74691 +\n offset_82644) * 8];\n }\n }\n offset_82644 = 1;\n while (slt32(offset_82644, wave_sizze_82640)) {\n if (slt32(local_tid_74691 + offset_82644, D_68526) &&\n ((local_tid_74691 - squot32(local_tid_74691, wave_sizze_82640) *\n wave_sizze_82640) & (2 * offset_82644 - 1)) == 0) {\n // read array element\n {\n x_77071 = *(volatile __local\n double *) &mem_80911[(local_tid_74691 +\n offset_82644) * 8];\n }\n // apply reduction operation\n {\n double res_77072;\n \n if (((slt32(gtid_74681, N_68508) && slt32(gtid_74682,\n K_68510)) &&\n slt32(gtid_74683, D_68526)) && slt32(ltid_74686,\n D_68526)) {\n res_77072 = x_77070 + x_77071;\n }\n x_77070 = res_77072;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_80911[local_tid_74691 * 8] =\n ",
" x_77070;\n }\n }\n offset_82644 *= 2;\n }\n skip_waves_82645 = 1;\n while (slt32(skip_waves_82645, squot32(D_68526 + wave_sizze_82640 - 1,\n wave_sizze_82640))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82644 = skip_waves_82645 * wave_sizze_82640;\n if (slt32(local_tid_74691 + offset_82644, D_68526) &&\n ((local_tid_74691 - squot32(local_tid_74691, wave_sizze_82640) *\n wave_sizze_82640) == 0 && (squot32(local_tid_74691,\n wave_sizze_82640) & (2 *\n skip_waves_82645 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_77071 = *(__local double *) &mem_80911[(local_tid_74691 +\n offset_82644) * 8];\n }\n // apply reduction operation\n {\n double res_77072;\n \n if (((slt32(gtid_74681, N_68508) && slt32(gtid_74682,\n K_68510)) &&\n slt32(gtid_74683, D_68526)) && slt32(ltid_74686,\n D_68526)) {\n res_77072 = x_77070 + x_77071;\n }\n x_77070 = res_77072;\n }\n // write result of operation\n {\n *(__local double *) &mem_80911[local_tid_74691 * 8] = x_77070;\n }\n }\n skip_waves_82645 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_77069 = *(__local double *) &mem_80911[0];\n \n double res_77073;\n double res_77074;\n \n if (((slt32(gtid_74681, N_68508) && slt32(gtid_74682, K_68510)) &&\n slt32(gtid_74683, D_68526)) && slt32(ltid_74686, D_6852",
"6)) {\n res_77073 = rev_sqnorm_arg_77063 * res_77069;\n res_77074 = res_77073 + res_77073;\n }\n if (local_tid_74691 == 0) {\n *(__global double *) &mem_80914[group_id_74692 * 8] = res_77074;\n }\n}\n__kernel void map_intra_group_78025(__local volatile\n int64_t *mem_81457_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t D_68526, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *res_mem_81412, __global\n unsigned char *mem_81460, __global\n unsigned char *mem_81464)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81457_backing_0 =\n mem_81457_backing_aligned_0;\n int32_t global_tid_78025;\n int32_t local_tid_78026;\n int32_t group_sizze_83075;\n int32_t wave_sizze_83074;\n int32_t group_id_78027;\n \n global_tid_78025 = get_global_id(0);\n local_tid_78026 = get_local_id(0);\n group_sizze_83075 = get_local_size(0);\n wave_sizze_83074 = LOCKSTEP_WIDTH;\n group_id_78027 = get_group_id(0);\n \n int32_t gtid_78012;\n int32_t ltid_78013;\n \n gtid_78012 = squot32(global_tid_78025, D_68526);\n ltid_78013 = global_tid_78025 - squot32(global_tid_78025, D_68526) *\n D_68526;\n \n double alphas_elem_78423;\n double x_79883;\n \n if (slt32(gtid_78012, K_68510) && slt32(ltid_78013, D_68526)) {\n alphas_elem_78423 = *(__global double *) &alphas_mem_80367[gtid_78012 *\n 8];\n \n double x_78429 = 0.0;\n \n for (int32_t chunk_offset_78428 = 0; chunk_offset_78428 < N_68508;\n chunk_offset_78428++) {\n double x_78436;\n ",
" double res_78439;\n \n x_78436 = *(__global double *) &res_mem_81412[(chunk_offset_78428 *\n (D_68526 * K_68510) +\n gtid_78012 *\n D_68526 +\n ltid_78013) * 8];\n res_78439 = x_78429 + x_78436;\n \n double x_tmp_83076 = res_78439;\n \n x_78429 = x_tmp_83076;\n }\n x_79883 = x_78429;\n }\n \n __local char *mem_81457;\n \n mem_81457 = (__local char *) mem_81457_backing_0;\n for (int32_t comb_iter_83077 = 0; comb_iter_83077 < 1; comb_iter_83077++) {\n int32_t ctid_78023;\n int32_t flat_comb_id_83078 = comb_iter_83077 * D_68526 +\n local_tid_78026;\n \n ctid_78023 = flat_comb_id_83078;\n if (slt32(ctid_78023, D_68526) && 1) {\n *(__local double *) &mem_81457[ctid_78023 * 8] = x_79883;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n double res_78440;\n \n if (slt32(gtid_78012, K_68510) && slt32(ltid_78013, D_68526)) {\n res_78440 = futrts_exp64(alphas_elem_78423);\n }\n if (local_tid_78026 == 0) {\n *(__global double *) &mem_81460[group_id_78027 * 8] = res_78440;\n }\n for (int32_t i_83080 = 0; i_83080 < squot32(D_68526 - local_tid_78026 +\n D_68526 - 1, D_68526);\n i_83080++) {\n *(__global double *) &mem_81464[(group_id_78027 * D_68526 + (i_83080 *\n D_68526 +\n local_tid_78026)) *\n 8] = *(__local\n double *) &mem_81457[(i_83080 *\n ",
" D_68526 +\n local_tid_78026) *\n 8];\n }\n}\n__kernel void map_intra_group_78182(__local volatile\n int64_t *mem_81476_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t D_68526, __global\n unsigned char *mem_81473, __global\n unsigned char *mem_81479)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81476_backing_0 =\n mem_81476_backing_aligned_0;\n int32_t global_tid_78182;\n int32_t local_tid_78183;\n int32_t group_sizze_83085;\n int32_t wave_sizze_83084;\n int32_t group_id_78184;\n \n global_tid_78182 = get_global_id(0);\n local_tid_78183 = get_local_id(0);\n group_sizze_83085 = get_local_size(0);\n wave_sizze_83084 = LOCKSTEP_WIDTH;\n group_id_78184 = get_group_id(0);\n \n int32_t gtid_78175;\n int32_t gtid_78176;\n int32_t ltid_78178;\n \n gtid_78175 = squot32(global_tid_78182, D_68526 * N_68508);\n gtid_78176 = squot32(global_tid_78182 - squot32(global_tid_78182, D_68526 *\n N_68508) * (D_68526 *\n N_68508),\n N_68508);\n ltid_78178 = global_tid_78182 - squot32(global_tid_78182, D_68526 *\n N_68508) * (D_68526 * N_68508) -\n squot32(global_tid_78182 - squot32(global_tid_78182, D_68526 *\n N_68508) * (D_68526 * N_68508),\n N_68508) * N_68508;\n \n double x_79887;\n \n if ((slt32(gtid_78175, K_68510) && slt32(gtid_78176, D_68526)) &&\n slt32(ltid_781",
"78, N_68508)) {\n x_79887 = *(__global double *) &mem_81473[(gtid_78175 * (N_68508 *\n D_68526) +\n gtid_78176 * N_68508 +\n ltid_78178) * 8];\n }\n \n __local char *mem_81476;\n double res_78478;\n \n mem_81476 = (__local char *) mem_81476_backing_0;\n for (int32_t comb_iter_83086 = 0; comb_iter_83086 < 1; comb_iter_83086++) {\n int32_t ctid_78180;\n int32_t flat_comb_id_83087 = comb_iter_83086 * N_68508 +\n local_tid_78183;\n \n ctid_78180 = flat_comb_id_83087;\n if (slt32(ctid_78180, N_68508) && 1) {\n *(__local double *) &mem_81476[ctid_78180 * 8] = x_79887;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_83088;\n int32_t skip_waves_83089;\n double x_78479;\n double x_78480;\n \n offset_83088 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_78183, N_68508)) {\n x_78479 = *(__local double *) &mem_81476[(local_tid_78183 +\n offset_83088) * 8];\n }\n }\n offset_83088 = 1;\n while (slt32(offset_83088, wave_sizze_83084)) {\n if (slt32(local_tid_78183 + offset_83088, N_68508) &&\n ((local_tid_78183 - squot32(local_tid_78183, wave_sizze_83084) *\n wave_sizze_83084) & (2 * offset_83088 - 1)) == 0) {\n // read array element\n {\n x_78480 = *(volatile __local\n double *) &mem_81476[(local_tid_78183 +\n offset_83088) * 8];\n }\n // apply reduction operation\n {\n double res_78481;\n \n if ((slt32(gtid_78175, K_68510) && slt32(gtid_78176,\n ",
" D_68526)) &&\n slt32(ltid_78178, N_68508)) {\n res_78481 = x_78479 + x_78480;\n }\n x_78479 = res_78481;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81476[local_tid_78183 * 8] =\n x_78479;\n }\n }\n offset_83088 *= 2;\n }\n skip_waves_83089 = 1;\n while (slt32(skip_waves_83089, squot32(N_68508 + wave_sizze_83084 - 1,\n wave_sizze_83084))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_83088 = skip_waves_83089 * wave_sizze_83084;\n if (slt32(local_tid_78183 + offset_83088, N_68508) &&\n ((local_tid_78183 - squot32(local_tid_78183, wave_sizze_83084) *\n wave_sizze_83084) == 0 && (squot32(local_tid_78183,\n wave_sizze_83084) & (2 *\n skip_waves_83089 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_78480 = *(__local double *) &mem_81476[(local_tid_78183 +\n offset_83088) * 8];\n }\n // apply reduction operation\n {\n double res_78481;\n \n if ((slt32(gtid_78175, K_68510) && slt32(gtid_78176,\n D_68526)) &&\n slt32(ltid_78178, N_68508)) {\n res_78481 = x_78479 + x_78480;\n }\n x_78479 = res_78481;\n }\n // write result of operation\n {\n *(__local double *) &mem_81476[local_tid_78183 * 8] = x_78479;\n }\n }\n skip_waves_83089 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n ",
" res_78478 = *(__local double *) &mem_81476[0];\n if (local_tid_78183 == 0) {\n *(__global double *) &mem_81479[group_id_78184 * 8] = res_78478;\n }\n}\n__kernel void map_intra_group_78574(__local volatile\n int64_t *mem_81549_backing_aligned_0,\n __local volatile\n int64_t *mem_81552_backing_aligned_1,\n __local volatile\n int64_t *mem_81555_backing_aligned_2,\n int32_t N_68508, int32_t K_68510,\n int32_t D_68514, int32_t triD_68516,\n int32_t D_68526, double res_68862,\n double t1389_68865, double res_68867,\n int32_t computed_group_sizze_78572, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *qs_mem_80369, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *res_mem_81411, __global\n unsigned char *res_mem_81413, __global\n unsigned char *res_mem_81414, __global\n unsigned char *mem_81559, __global\n unsigned char *mem_81563, __global\n unsigned char *mem_81566)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81549_backing_0 =\n mem_81549_backing_aligned_0;\n __local volatile char *restrict mem_81552_backing_1 =\n mem_81552_backing_aligned_1;\n __local volatile char *restrict mem_81555_backing_2 =\n mem_81555_backing_aligned_2;\n ",
" int32_t global_tid_78574;\n int32_t local_tid_78575;\n int32_t group_sizze_83165;\n int32_t wave_sizze_83164;\n int32_t group_id_78576;\n \n global_tid_78574 = get_global_id(0);\n local_tid_78575 = get_local_id(0);\n group_sizze_83165 = get_local_size(0);\n wave_sizze_83164 = LOCKSTEP_WIDTH;\n group_id_78576 = get_group_id(0);\n \n int32_t gtid_78545;\n int32_t ltid_78546;\n \n gtid_78545 = squot32(global_tid_78574, computed_group_sizze_78572);\n ltid_78546 = global_tid_78574 - squot32(global_tid_78574,\n computed_group_sizze_78572) *\n computed_group_sizze_78572;\n \n double alphas_elem_78784;\n \n if (slt32(gtid_78545, K_68510) && slt32(ltid_78546,\n computed_group_sizze_78572)) {\n alphas_elem_78784 = *(__global double *) &alphas_mem_80367[gtid_78545 *\n 8];\n }\n \n __local char *mem_81549;\n double res_78789;\n \n mem_81549 = (__local char *) mem_81549_backing_0;\n for (int32_t comb_iter_83166 = 0; comb_iter_83166 < squot32(N_68508 +\n computed_group_sizze_78572 -\n 1,\n computed_group_sizze_78572);\n comb_iter_83166++) {\n int32_t ctid_78548;\n int32_t flat_comb_id_83167 = comb_iter_83166 *\n computed_group_sizze_78572 + local_tid_78575;\n \n ctid_78548 = flat_comb_id_83167;\n if (slt32(ctid_78548, N_68508) && 1) {\n double x_78788 = *(__global double *) &res_mem_81411[(ltid_78546 *\n K_68510 +\n gtid_78545) *\n 8];\n",
" \n *(__local double *) &mem_81549[ctid_78548 * 8] = x_78788;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_83168;\n int32_t skip_waves_83169;\n double x_78790;\n double x_78791;\n \n offset_83168 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_78575, N_68508)) {\n x_78790 = *(__local double *) &mem_81549[(local_tid_78575 +\n offset_83168) * 8];\n }\n }\n offset_83168 = 1;\n while (slt32(offset_83168, wave_sizze_83164)) {\n if (slt32(local_tid_78575 + offset_83168, N_68508) &&\n ((local_tid_78575 - squot32(local_tid_78575, wave_sizze_83164) *\n wave_sizze_83164) & (2 * offset_83168 - 1)) == 0) {\n // read array element\n {\n x_78791 = *(volatile __local\n double *) &mem_81549[(local_tid_78575 +\n offset_83168) * 8];\n }\n // apply reduction operation\n {\n double res_78792;\n \n if (slt32(gtid_78545, K_68510) && slt32(ltid_78546,\n computed_group_sizze_78572)) {\n res_78792 = x_78790 + x_78791;\n }\n x_78790 = res_78792;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81549[local_tid_78575 * 8] =\n x_78790;\n }\n }\n offset_83168 *= 2;\n }\n skip_waves_83169 = 1;\n while (slt32(skip_waves_83169, squot32(computed_group_sizze_78572 +\n wave_sizze_83164 - 1,\n wave_sizze_83164))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_83168 = skip_waves_83169 * wave_sizze_83164;\n if (slt32(l",
"ocal_tid_78575 + offset_83168, N_68508) &&\n ((local_tid_78575 - squot32(local_tid_78575, wave_sizze_83164) *\n wave_sizze_83164) == 0 && (squot32(local_tid_78575,\n wave_sizze_83164) & (2 *\n skip_waves_83169 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_78791 = *(__local double *) &mem_81549[(local_tid_78575 +\n offset_83168) * 8];\n }\n // apply reduction operation\n {\n double res_78792;\n \n if (slt32(gtid_78545, K_68510) && slt32(ltid_78546,\n computed_group_sizze_78572)) {\n res_78792 = x_78790 + x_78791;\n }\n x_78790 = res_78792;\n }\n // write result of operation\n {\n *(__local double *) &mem_81549[local_tid_78575 * 8] = x_78790;\n }\n }\n skip_waves_83169 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_78789 = *(__local double *) &mem_81549[0];\n \n double res_78793;\n double res_78794;\n double res_78795;\n \n if (slt32(gtid_78545, K_68510) && slt32(ltid_78546,\n computed_group_sizze_78572)) {\n res_78793 = futrts_exp64(alphas_elem_78784);\n res_78794 = res_68862 * res_78793;\n res_78795 = res_78789 + res_78794;\n }\n \n __local char *mem_81552;\n __local char *mem_81555;\n \n mem_81552 = (__local char *) mem_81552_backing_1;\n for (int32_t comb_iter_83170 = 0; comb_iter_83170 < squot32(D_68526 +\n computed_group_sizze_78572 -\n ",
" 1,\n computed_group_sizze_78572);\n comb_iter_83170++) {\n int32_t ctid_78558;\n int32_t flat_comb_id_83171 = comb_iter_83170 *\n computed_group_sizze_78572 + local_tid_78575;\n \n ctid_78558 = flat_comb_id_83171;\n if (slt32(ctid_78558, D_68526) && 1) {\n double qs_elem_elem_78799 = *(__global\n double *) &qs_mem_80369[(gtid_78545 *\n D_68514 +\n ltid_78546) *\n 8];\n double res_78801;\n double x_78804 = 0.0;\n int32_t chunk_sizze_78802;\n int32_t chunk_offset_78803 = 0;\n \n chunk_sizze_78802 = N_68508;\n \n double res_78806;\n double acc_78809 = x_78804;\n int32_t groupstream_mapaccum_dummy_chunk_sizze_78807;\n \n groupstream_mapaccum_dummy_chunk_sizze_78807 = 1;\n if (chunk_sizze_78802 == N_68508) {\n for (int32_t i_78808 = 0; i_78808 < N_68508; i_78808++) {\n double x_78811;\n double res_78814;\n \n x_78811 = *(__global double *) &res_mem_81413[(gtid_78545 *\n D_68526 +\n ltid_78546 +\n D_68526 *\n K_68510 *\n chunk_offset_78803 +\n D_68526 *\n ",
" K_68510 *\n i_78808 + 0 *\n (D_68526 *\n K_68510)) *\n 8];\n res_78814 = acc_78809 + x_78811;\n \n double acc_tmp_83172 = res_78814;\n \n acc_78809 = acc_tmp_83172;\n }\n } else {\n for (int32_t i_78808 = 0; i_78808 < chunk_sizze_78802;\n i_78808++) {\n double x_78811;\n double res_78814;\n \n x_78811 = *(__global double *) &res_mem_81413[(gtid_78545 *\n D_68526 +\n ltid_78546 +\n D_68526 *\n K_68510 *\n chunk_offset_78803 +\n D_68526 *\n K_68510 *\n i_78808 + 0 *\n (D_68526 *\n K_68510)) *\n 8];\n res_78814 = acc_78809 + x_78811;\n \n double acc_tmp_83173 = res_78814;\n \n acc_78809 = acc_tmp_83173;\n }\n }\n res_78806 = acc_78809;\n x_78804 = ",
"res_78806;\n res_78801 = x_78804;\n \n double res_78815;\n double res_78816;\n double res_78817;\n double res_78819;\n double res_78820;\n double res_78821;\n \n res_78815 = futrts_exp64(qs_elem_elem_78799);\n res_78816 = t1389_68865 * res_78815;\n res_78817 = res_78816 + res_78816;\n res_78819 = res_78815 * res_78817;\n res_78820 = res_68867 + res_78819;\n res_78821 = res_78801 + res_78820;\n *(__local double *) &mem_81552[ctid_78558 * 8] = res_78821;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n mem_81555 = (__local char *) mem_81555_backing_2;\n for (int32_t comb_iter_83174 = 0; comb_iter_83174 < squot32(triD_68516 +\n computed_group_sizze_78572 -\n 1,\n computed_group_sizze_78572);\n comb_iter_83174++) {\n int32_t ctid_78568;\n int32_t flat_comb_id_83175 = comb_iter_83174 *\n computed_group_sizze_78572 + local_tid_78575;\n \n ctid_78568 = flat_comb_id_83175;\n if (slt32(ctid_78568, triD_68516) && 1) {\n double icf_elem_elem_78824 = *(__global\n double *) &icf_mem_80370[(gtid_78545 *\n triD_68516 +\n ltid_78546) *\n 8];\n double res_78825;\n double x_78828 = 0.0;\n int32_t chunk_sizze_78826;\n int32_t chunk_offset_78827 = 0;\n \n chunk_sizze_78826 = N_68508;\n \n double res_78830;\n double acc_78833 = x_78828;\n int32_t",
" groupstream_mapaccum_dummy_chunk_sizze_78831;\n \n groupstream_mapaccum_dummy_chunk_sizze_78831 = 1;\n if (chunk_sizze_78826 == N_68508) {\n for (int32_t i_78832 = 0; i_78832 < N_68508; i_78832++) {\n double x_78835;\n double res_78838;\n \n x_78835 = *(__global double *) &res_mem_81414[(gtid_78545 *\n triD_68516 +\n ltid_78546 +\n triD_68516 *\n K_68510 *\n chunk_offset_78827 +\n triD_68516 *\n K_68510 *\n i_78832 + 0 *\n (triD_68516 *\n K_68510)) *\n 8];\n res_78838 = acc_78833 + x_78835;\n \n double acc_tmp_83176 = res_78838;\n \n acc_78833 = acc_tmp_83176;\n }\n } else {\n for (int32_t i_78832 = 0; i_78832 < chunk_sizze_78826;\n i_78832++) {\n double x_78835;\n double res_78838;\n \n x_78835 = *(__global double *) &res_mem_81414[(gtid_78545 *\n triD_68516 +\n ltid_78546 +\n ",
" triD_68516 *\n K_68510 *\n chunk_offset_78827 +\n triD_68516 *\n K_68510 *\n i_78832 + 0 *\n (triD_68516 *\n K_68510)) *\n 8];\n res_78838 = acc_78833 + x_78835;\n \n double acc_tmp_83177 = res_78838;\n \n acc_78833 = acc_tmp_83177;\n }\n }\n res_78830 = acc_78833;\n x_78828 = res_78830;\n res_78825 = x_78828;\n \n double res_78839;\n double res_78840;\n double res_78841;\n \n res_78839 = t1389_68865 * icf_elem_elem_78824;\n res_78840 = res_78839 + res_78839;\n res_78841 = res_78825 + res_78840;\n *(__local double *) &mem_81555[ctid_78568 * 8] = res_78841;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n for (int32_t i_83178 = 0; i_83178 < squot32(triD_68516 - local_tid_78575 +\n computed_group_sizze_78572 - 1,\n computed_group_sizze_78572);\n i_83178++) {\n *(__global double *) &mem_81559[(group_id_78576 * triD_68516 +\n (i_83178 * computed_group_sizze_78572 +\n local_tid_78575)) * 8] = *(__local\n double *) &mem_81555[(i_83178 *\n ",
" computed_group_sizze_78572 +\n local_tid_78575) *\n 8];\n }\n for (int32_t i_83179 = 0; i_83179 < squot32(D_68526 - local_tid_78575 +\n computed_group_sizze_78572 - 1,\n computed_group_sizze_78572);\n i_83179++) {\n *(__global double *) &mem_81563[(group_id_78576 * D_68526 + (i_83179 *\n computed_group_sizze_78572 +\n local_tid_78575)) *\n 8] = *(__local\n double *) &mem_81552[(i_83179 *\n computed_group_sizze_78572 +\n local_tid_78575) *\n 8];\n }\n if (local_tid_78575 == 0) {\n *(__global double *) &mem_81566[group_id_78576 * 8] = res_78795;\n }\n}\n__kernel void map_intra_group_78874(__local volatile\n int64_t *mem_81626_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t K_68515, int32_t triD_68516,\n double t1389_68865, __global\n unsigned char *mem_81618, __global\n unsigned char *mem_81623, __global\n unsigned char *mem_81629)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volat",
"ile char *restrict mem_81626_backing_0 =\n mem_81626_backing_aligned_0;\n int32_t global_tid_78874;\n int32_t local_tid_78875;\n int32_t group_sizze_83276;\n int32_t wave_sizze_83275;\n int32_t group_id_78876;\n \n global_tid_78874 = get_global_id(0);\n local_tid_78875 = get_local_id(0);\n group_sizze_83276 = get_local_size(0);\n wave_sizze_83275 = LOCKSTEP_WIDTH;\n group_id_78876 = get_group_id(0);\n \n int32_t gtid_78867;\n int32_t gtid_78868;\n int32_t ltid_78870;\n \n gtid_78867 = squot32(global_tid_78874, triD_68516 * N_68508);\n gtid_78868 = squot32(global_tid_78874 - squot32(global_tid_78874,\n triD_68516 * N_68508) *\n (triD_68516 * N_68508), N_68508);\n ltid_78870 = global_tid_78874 - squot32(global_tid_78874, triD_68516 *\n N_68508) * (triD_68516 * N_68508) -\n squot32(global_tid_78874 - squot32(global_tid_78874, triD_68516 *\n N_68508) * (triD_68516 * N_68508),\n N_68508) * N_68508;\n \n double icf_elem_elem_79427;\n double x_79977;\n \n if ((slt32(gtid_78867, K_68510) && slt32(gtid_78868, triD_68516)) &&\n slt32(ltid_78870, N_68508)) {\n icf_elem_elem_79427 = *(__global double *) &mem_81618[(gtid_78868 *\n K_68515 +\n gtid_78867) * 8];\n x_79977 = *(__global double *) &mem_81623[(gtid_78867 * (N_68508 *\n triD_68516) +\n gtid_78868 * N_68508 +\n ltid_78870) * 8];\n }\n \n __local char *mem_81626;\n double res_79430;\n \n mem_81626 = (__local char *) mem_81626_backing_0;\n for (int32_t comb_iter_83277 =",
" 0; comb_iter_83277 < 1; comb_iter_83277++) {\n int32_t ctid_78872;\n int32_t flat_comb_id_83278 = comb_iter_83277 * N_68508 +\n local_tid_78875;\n \n ctid_78872 = flat_comb_id_83278;\n if (slt32(ctid_78872, N_68508) && 1) {\n *(__local double *) &mem_81626[ctid_78872 * 8] = x_79977;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_83279;\n int32_t skip_waves_83280;\n double x_79431;\n double x_79432;\n \n offset_83279 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_78875, N_68508)) {\n x_79431 = *(__local double *) &mem_81626[(local_tid_78875 +\n offset_83279) * 8];\n }\n }\n offset_83279 = 1;\n while (slt32(offset_83279, wave_sizze_83275)) {\n if (slt32(local_tid_78875 + offset_83279, N_68508) &&\n ((local_tid_78875 - squot32(local_tid_78875, wave_sizze_83275) *\n wave_sizze_83275) & (2 * offset_83279 - 1)) == 0) {\n // read array element\n {\n x_79432 = *(volatile __local\n double *) &mem_81626[(local_tid_78875 +\n offset_83279) * 8];\n }\n // apply reduction operation\n {\n double res_79433;\n \n if ((slt32(gtid_78867, K_68510) && slt32(gtid_78868,\n triD_68516)) &&\n slt32(ltid_78870, N_68508)) {\n res_79433 = x_79431 + x_79432;\n }\n x_79431 = res_79433;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81626[local_tid_78875 * 8] =\n x_79431;\n }\n }\n offset_83279 *= 2;\n }\n skip_waves_83280 = 1;\n while (slt32(skip_wave",
"s_83280, squot32(N_68508 + wave_sizze_83275 - 1,\n wave_sizze_83275))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_83279 = skip_waves_83280 * wave_sizze_83275;\n if (slt32(local_tid_78875 + offset_83279, N_68508) &&\n ((local_tid_78875 - squot32(local_tid_78875, wave_sizze_83275) *\n wave_sizze_83275) == 0 && (squot32(local_tid_78875,\n wave_sizze_83275) & (2 *\n skip_waves_83280 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_79432 = *(__local double *) &mem_81626[(local_tid_78875 +\n offset_83279) * 8];\n }\n // apply reduction operation\n {\n double res_79433;\n \n if ((slt32(gtid_78867, K_68510) && slt32(gtid_78868,\n triD_68516)) &&\n slt32(ltid_78870, N_68508)) {\n res_79433 = x_79431 + x_79432;\n }\n x_79431 = res_79433;\n }\n // write result of operation\n {\n *(__local double *) &mem_81626[local_tid_78875 * 8] = x_79431;\n }\n }\n skip_waves_83280 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_79430 = *(__local double *) &mem_81626[0];\n \n double res_79434;\n double res_79435;\n double res_79436;\n \n if ((slt32(gtid_78867, K_68510) && slt32(gtid_78868, triD_68516)) &&\n slt32(ltid_78870, N_68508)) {\n res_79434 = t1389_68865 * icf_elem_elem_79427;\n res_79435 = res_79434 + res_79434;\n res_79436 = res_79430 + res_79435;\n }\n if (local_tid_78875 == 0) {\n *(__global double *) &mem_81629[grou",
"p_id_78876 * 8] = res_79436;\n }\n}\n__kernel void map_intra_group_79031(__local volatile\n int64_t *mem_81592_backing_aligned_0,\n int32_t N_68508, int32_t K_68510,\n int32_t K_68513, int32_t D_68526,\n double t1389_68865, double res_68867,\n __global unsigned char *mem_81584, __global\n unsigned char *mem_81589, __global\n unsigned char *mem_81595)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n __local volatile char *restrict mem_81592_backing_0 =\n mem_81592_backing_aligned_0;\n int32_t global_tid_79031;\n int32_t local_tid_79032;\n int32_t group_sizze_83227;\n int32_t wave_sizze_83226;\n int32_t group_id_79033;\n \n global_tid_79031 = get_global_id(0);\n local_tid_79032 = get_local_id(0);\n group_sizze_83227 = get_local_size(0);\n wave_sizze_83226 = LOCKSTEP_WIDTH;\n group_id_79033 = get_group_id(0);\n \n int32_t gtid_79024;\n int32_t gtid_79025;\n int32_t ltid_79027;\n \n gtid_79024 = squot32(global_tid_79031, D_68526 * N_68508);\n gtid_79025 = squot32(global_tid_79031 - squot32(global_tid_79031, D_68526 *\n N_68508) * (D_68526 *\n N_68508),\n N_68508);\n ltid_79027 = global_tid_79031 - squot32(global_tid_79031, D_68526 *\n N_68508) * (D_68526 * N_68508) -\n squot32(global_tid_79031 - squot32(global_tid_79031, D_68526 *\n N_68508) * (D_68526 * N_68508),\n N_68508) * N_68508;\n \n double qs_elem_elem_79330;\n double x_79967;\n \n if ((slt32(gtid_79024, K_68510) && slt32(gtid_7",
"9025, D_68526)) &&\n slt32(ltid_79027, N_68508)) {\n qs_elem_elem_79330 = *(__global double *) &mem_81584[(gtid_79025 *\n K_68513 +\n gtid_79024) * 8];\n x_79967 = *(__global double *) &mem_81589[(gtid_79024 * (N_68508 *\n D_68526) +\n gtid_79025 * N_68508 +\n ltid_79027) * 8];\n }\n \n __local char *mem_81592;\n double res_79334;\n \n mem_81592 = (__local char *) mem_81592_backing_0;\n for (int32_t comb_iter_83228 = 0; comb_iter_83228 < 1; comb_iter_83228++) {\n int32_t ctid_79029;\n int32_t flat_comb_id_83229 = comb_iter_83228 * N_68508 +\n local_tid_79032;\n \n ctid_79029 = flat_comb_id_83229;\n if (slt32(ctid_79029, N_68508) && 1) {\n *(__local double *) &mem_81592[ctid_79029 * 8] = x_79967;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_83230;\n int32_t skip_waves_83231;\n double x_79335;\n double x_79336;\n \n offset_83230 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_79032, N_68508)) {\n x_79335 = *(__local double *) &mem_81592[(local_tid_79032 +\n offset_83230) * 8];\n }\n }\n offset_83230 = 1;\n while (slt32(offset_83230, wave_sizze_83226)) {\n if (slt32(local_tid_79032 + offset_83230, N_68508) &&\n ((local_tid_79032 - squot32(local_tid_79032, wave_sizze_83226) *\n wave_sizze_83226) & (2 * offset_83230 - 1)) == 0) {\n // read array element\n {\n x_79336 = *(volatile __local\n double *) &mem_81592[(local_tid_79032 +\n ",
" offset_83230) * 8];\n }\n // apply reduction operation\n {\n double res_79337;\n \n if ((slt32(gtid_79024, K_68510) && slt32(gtid_79025,\n D_68526)) &&\n slt32(ltid_79027, N_68508)) {\n res_79337 = x_79335 + x_79336;\n }\n x_79335 = res_79337;\n }\n // write result of operation\n {\n *(volatile __local double *) &mem_81592[local_tid_79032 * 8] =\n x_79335;\n }\n }\n offset_83230 *= 2;\n }\n skip_waves_83231 = 1;\n while (slt32(skip_waves_83231, squot32(N_68508 + wave_sizze_83226 - 1,\n wave_sizze_83226))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_83230 = skip_waves_83231 * wave_sizze_83226;\n if (slt32(local_tid_79032 + offset_83230, N_68508) &&\n ((local_tid_79032 - squot32(local_tid_79032, wave_sizze_83226) *\n wave_sizze_83226) == 0 && (squot32(local_tid_79032,\n wave_sizze_83226) & (2 *\n skip_waves_83231 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_79336 = *(__local double *) &mem_81592[(local_tid_79032 +\n offset_83230) * 8];\n }\n // apply reduction operation\n {\n double res_79337;\n \n if ((slt32(gtid_79024, K_68510) && slt32(gtid_79025,\n D_68526)) &&\n slt32(ltid_79027, N_68508)) {\n res_79337 = x_79335 + x_79336;\n }\n ",
" x_79335 = res_79337;\n }\n // write result of operation\n {\n *(__local double *) &mem_81592[local_tid_79032 * 8] = x_79335;\n }\n }\n skip_waves_83231 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n res_79334 = *(__local double *) &mem_81592[0];\n \n double res_79338;\n double res_79339;\n double res_79340;\n double res_79342;\n double res_79343;\n double res_79344;\n \n if ((slt32(gtid_79024, K_68510) && slt32(gtid_79025, D_68526)) &&\n slt32(ltid_79027, N_68508)) {\n res_79338 = futrts_exp64(qs_elem_elem_79330);\n res_79339 = t1389_68865 * res_79338;\n res_79340 = res_79339 + res_79339;\n res_79342 = res_79338 * res_79340;\n res_79343 = res_68867 + res_79342;\n res_79344 = res_79334 + res_79343;\n }\n if (local_tid_79032 == 0) {\n *(__global double *) &mem_81595[group_id_79033 * 8] = res_79344;\n }\n}\n__kernel void map_transpose_f64(int32_t destoffset_1, int32_t srcoffset_3,\n int32_t num_arrays_4, int32_t x_elems_5,\n int32_t y_elems_6, int32_t in_elems_7,\n int32_t out_elems_8, int32_t mulx_9,\n int32_t muly_10, __global\n unsigned char *destmem_0, __global\n unsigned char *srcmem_2)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(block_11_backing_0, 8448);\n \n __local char *block_11;\n \n block_11 = (__local char *) block_11_backing_0;\n \n int32_t get_global_id_0_37;\n \n get_global_id_0_37 = get_global_id(0);\n \n int32_t get_local_id_0_38;\n \n get_local_id_0_38 = get_local_id(0);\n \n int32_t get_local_id_1_39;\n \n get_local_id_1_39 = get_local_id(1);\n \n int32_t get_group_id_0_40;\n \n get_group_id_0_40 = get_group_id(0);\n ",
" \n int32_t get_group_id_1_41;\n \n get_group_id_1_41 = get_group_id(1);\n \n int32_t get_group_id_2_42;\n \n get_group_id_2_42 = get_group_id(2);\n \n int32_t our_array_offset_30 = get_group_id_2_42 * x_elems_5 * y_elems_6;\n int32_t odata_offset_33 = squot32(destoffset_1, 8) + our_array_offset_30;\n int32_t idata_offset_34 = squot32(srcoffset_3, 8) + our_array_offset_30;\n int32_t x_index_31 = get_global_id_0_37;\n int32_t y_index_32 = get_group_id_1_41 * 32 + get_local_id_1_39;\n \n if (slt32(x_index_31, x_elems_5)) {\n for (int32_t j_43 = 0; j_43 < 4; j_43++) {\n int32_t index_in_35 = (y_index_32 + j_43 * 8) * x_elems_5 +\n x_index_31;\n \n if (slt32(y_index_32 + j_43 * 8, y_elems_6) && slt32(index_in_35,\n in_elems_7)) {\n *(__local double *) &block_11[((get_local_id_1_39 + j_43 * 8) *\n 33 + get_local_id_0_38) *\n sizeof(double)] = *(__global\n double *) &srcmem_2[(idata_offset_34 +\n index_in_35) *\n sizeof(double)];\n }\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n x_index_31 = get_group_id_1_41 * 32 + get_local_id_0_38;\n y_index_32 = get_group_id_0_40 * 32 + get_local_id_1_39;\n if (slt32(x_index_31, y_elems_6)) {\n for (int32_t j_43 = 0; j_43 < 4; j_43++) {\n int32_t index_out_36 = (y_index_32 + j_43 * 8) * y_elems_6 +\n x_index_31;\n \n if (slt32(y_index_32 + j_43 * 8, x_elems_5) && slt32(index_out_36,\n out_elems_8)) {\n *(__glob",
"al double *) &destmem_0[(odata_offset_33 +\n index_out_36) *\n sizeof(double)] = *(__local\n double *) &block_11[(get_local_id_0_38 *\n 33 +\n get_local_id_1_39 +\n j_43 *\n 8) *\n sizeof(double)];\n }\n }\n }\n}\n__kernel void map_transpose_f64_low_height(int32_t destoffset_1,\n int32_t srcoffset_3,\n int32_t num_arrays_4,\n int32_t x_elems_5, int32_t y_elems_6,\n int32_t in_elems_7,\n int32_t out_elems_8, int32_t mulx_9,\n int32_t muly_10, __global\n unsigned char *destmem_0, __global\n unsigned char *srcmem_2)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(block_11_backing_0, 2176);\n \n __local char *block_11;\n \n block_11 = (__local char *) block_11_backing_0;\n \n int32_t get_global_id_0_37;\n \n get_global_id_0_37 = get_global_id(0);\n \n int32_t get_local_id_0_38;\n \n get_local_id_0_38 = get_local_id(0);\n \n int32_t get_local_id_1_39;\n \n get_local_id_1_39 = get_local_id(1);\n \n int32_t get_group_id_0_40;\n \n get_group_i",
"d_0_40 = get_group_id(0);\n \n int32_t get_group_id_1_41;\n \n get_group_id_1_41 = get_group_id(1);\n \n int32_t get_group_id_2_42;\n \n get_group_id_2_42 = get_group_id(2);\n \n int32_t our_array_offset_30 = get_group_id_2_42 * x_elems_5 * y_elems_6;\n int32_t odata_offset_33 = squot32(destoffset_1, 8) + our_array_offset_30;\n int32_t idata_offset_34 = squot32(srcoffset_3, 8) + our_array_offset_30;\n int32_t x_index_31 = get_group_id_0_40 * 16 * mulx_9 + get_local_id_0_38 +\n srem32(get_local_id_1_39, mulx_9) * 16;\n int32_t y_index_32 = get_group_id_1_41 * 16 + squot32(get_local_id_1_39,\n mulx_9);\n int32_t index_in_35 = y_index_32 * x_elems_5 + x_index_31;\n \n if (slt32(x_index_31, x_elems_5) && (slt32(y_index_32, y_elems_6) &&\n slt32(index_in_35, in_elems_7))) {\n *(__local double *) &block_11[(get_local_id_1_39 * 17 +\n get_local_id_0_38) * sizeof(double)] =\n *(__global double *) &srcmem_2[(idata_offset_34 + index_in_35) *\n sizeof(double)];\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n x_index_31 = get_group_id_1_41 * 16 + squot32(get_local_id_0_38, mulx_9);\n y_index_32 = get_group_id_0_40 * 16 * mulx_9 + get_local_id_1_39 +\n srem32(get_local_id_0_38, mulx_9) * 16;\n \n int32_t index_out_36 = y_index_32 * y_elems_6 + x_index_31;\n \n if (slt32(x_index_31, y_elems_6) && (slt32(y_index_32, x_elems_5) &&\n slt32(index_out_36, out_elems_8))) {\n *(__global double *) &destmem_0[(odata_offset_33 + index_out_36) *\n sizeof(double)] = *(__local\n double *) &block_11[(get_local_id_0_38 *\n 17 +\n ",
" get_local_id_1_39) *\n sizeof(double)];\n }\n}\n__kernel void map_transpose_f64_low_width(int32_t destoffset_1,\n int32_t srcoffset_3,\n int32_t num_arrays_4,\n int32_t x_elems_5, int32_t y_elems_6,\n int32_t in_elems_7,\n int32_t out_elems_8, int32_t mulx_9,\n int32_t muly_10, __global\n unsigned char *destmem_0, __global\n unsigned char *srcmem_2)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(block_11_backing_0, 2176);\n \n __local char *block_11;\n \n block_11 = (__local char *) block_11_backing_0;\n \n int32_t get_global_id_0_37;\n \n get_global_id_0_37 = get_global_id(0);\n \n int32_t get_local_id_0_38;\n \n get_local_id_0_38 = get_local_id(0);\n \n int32_t get_local_id_1_39;\n \n get_local_id_1_39 = get_local_id(1);\n \n int32_t get_group_id_0_40;\n \n get_group_id_0_40 = get_group_id(0);\n \n int32_t get_group_id_1_41;\n \n get_group_id_1_41 = get_group_id(1);\n \n int32_t get_group_id_2_42;\n \n get_group_id_2_42 = get_group_id(2);\n \n int32_t our_array_offset_30 = get_group_id_2_42 * x_elems_5 * y_elems_6;\n int32_t odata_offset_33 = squot32(destoffset_1, 8) + our_array_offset_30;\n int32_t idata_offset_34 = squot32(srcoffset_3, 8) + our_array_offset_30;\n int32_t x_index_31 = get_group_id_0_40 * 16 + squot32(get_local_id_0_38,\n muly_10);\n int32_t y_index_32 = get_group_id_1_41 * 16 * muly_10 + get",
"_local_id_1_39 +\n srem32(get_local_id_0_38, muly_10) * 16;\n int32_t index_in_35 = y_index_32 * x_elems_5 + x_index_31;\n \n if (slt32(x_index_31, x_elems_5) && (slt32(y_index_32, y_elems_6) &&\n slt32(index_in_35, in_elems_7))) {\n *(__local double *) &block_11[(get_local_id_1_39 * 17 +\n get_local_id_0_38) * sizeof(double)] =\n *(__global double *) &srcmem_2[(idata_offset_34 + index_in_35) *\n sizeof(double)];\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n x_index_31 = get_group_id_1_41 * 16 * muly_10 + get_local_id_0_38 +\n srem32(get_local_id_1_39, muly_10) * 16;\n y_index_32 = get_group_id_0_40 * 16 + squot32(get_local_id_1_39, muly_10);\n \n int32_t index_out_36 = y_index_32 * y_elems_6 + x_index_31;\n \n if (slt32(x_index_31, y_elems_6) && (slt32(y_index_32, x_elems_5) &&\n slt32(index_out_36, out_elems_8))) {\n *(__global double *) &destmem_0[(odata_offset_33 + index_out_36) *\n sizeof(double)] = *(__local\n double *) &block_11[(get_local_id_0_38 *\n 17 +\n get_local_id_1_39) *\n sizeof(double)];\n }\n}\n__kernel void map_transpose_f64_small(int32_t destoffset_1, int32_t srcoffset_3,\n int32_t num_arrays_4, int32_t x_elems_5,\n int32_t y_elems_6, int32_t in_elems_7,\n int32_t out_elems_8, int32_t mulx_9,\n int32_t muly_10, __global\n unsigned char *des",
"tmem_0, __global\n unsigned char *srcmem_2)\n{\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1);\n \n __local char *block_11;\n \n block_11 = (__local char *) block_11_backing_0;\n \n int32_t get_global_id_0_37;\n \n get_global_id_0_37 = get_global_id(0);\n \n int32_t get_local_id_0_38;\n \n get_local_id_0_38 = get_local_id(0);\n \n int32_t get_local_id_1_39;\n \n get_local_id_1_39 = get_local_id(1);\n \n int32_t get_group_id_0_40;\n \n get_group_id_0_40 = get_group_id(0);\n \n int32_t get_group_id_1_41;\n \n get_group_id_1_41 = get_group_id(1);\n \n int32_t get_group_id_2_42;\n \n get_group_id_2_42 = get_group_id(2);\n \n int32_t our_array_offset_30 = squot32(get_global_id_0_37, y_elems_6 *\n x_elems_5) * (y_elems_6 * x_elems_5);\n int32_t x_index_31 = squot32(srem32(get_global_id_0_37, y_elems_6 *\n x_elems_5), y_elems_6);\n int32_t y_index_32 = srem32(get_global_id_0_37, y_elems_6);\n int32_t odata_offset_33 = squot32(destoffset_1, 8) + our_array_offset_30;\n int32_t idata_offset_34 = squot32(srcoffset_3, 8) + our_array_offset_30;\n int32_t index_in_35 = y_index_32 * x_elems_5 + x_index_31;\n int32_t index_out_36 = x_index_31 * y_elems_6 + y_index_32;\n \n if (slt32(get_global_id_0_37, in_elems_7)) {\n *(__global double *) &destmem_0[(odata_offset_33 + index_out_36) *\n sizeof(double)] = *(__global\n double *) &srcmem_2[(idata_offset_34 +\n index_in_35) *\n sizeof(double)];\n }\n}\n__kernel void segred_large_69652(int32_t N_68316, ",
"int32_t D_68317,\n int32_t K_68318, int32_t K_68319,\n int32_t K_68321, int32_t K_68323,\n int32_t D_68333, int32_t num_groups_70125,\n __global unsigned char *x_mem_80366, __global\n unsigned char *alphas_mem_80367, __global\n unsigned char *mem_80418, __global\n unsigned char *mem_80422, __global\n unsigned char *mem_80426, __global\n unsigned char *mem_80429, __global\n unsigned char *mem_80432,\n int32_t thread_per_segment_82077, __global\n unsigned char *group_res_arr_mem_82078,\n __global unsigned char *counter_mem_82080)\n{\n const int32_t group_sizze_70115 = gmm_objectivezigroup_sizze_69634;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82082_backing_0, 8 *\n gmm_objectivezigroup_sizze_69634);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82084_backing_1, 1);\n \n int32_t global_tid_69652;\n int32_t local_tid_69653;\n int32_t group_sizze_82074;\n int32_t wave_sizze_82073;\n int32_t group_id_69654;\n \n global_tid_69652 = get_global_id(0);\n local_tid_69653 = get_local_id(0);\n group_sizze_82074 = get_local_size(0);\n wave_sizze_82073 = LOCKSTEP_WIDTH;\n group_id_69654 = get_group_id(0);\n \n int32_t gtid_69623;\n int32_t gtid_69651;\n __local char *red_arr_mem_82082;\n \n red_arr_mem_82082 = (__local char *) red_arr_mem_82082_backing_0;\n \n __local char *sync_arr_mem_82084;\n \n sync_arr_mem_82084 = (__local char *) sync_arr_mem_82084_backing_1;\n gtid_69623 = squot32(group_id_69654, squot32(num_groups_70125 + smax32",
"(1,\n N_68316) -\n 1, smax32(1, N_68316)));\n \n int32_t chunk_sizze_82086 = smin32(squot32(K_68318 + group_sizze_70115 *\n squot32(num_groups_70125 +\n smax32(1, N_68316) - 1,\n smax32(1, N_68316)) - 1,\n group_sizze_70115 *\n squot32(num_groups_70125 +\n smax32(1, N_68316) - 1,\n smax32(1, N_68316))),\n squot32(K_68318 -\n srem32(global_tid_69652,\n group_sizze_70115 *\n squot32(num_groups_70125 +\n smax32(1,\n N_68316) -\n 1, smax32(1,\n N_68316))) +\n thread_per_segment_82077 - 1,\n thread_per_segment_82077));\n double x_70131;\n double x_70132;\n \n x_70131 = 0.0;\n for (int32_t i_82090 = 0; i_82090 < chunk_sizze_82086; i_82090++) {\n gtid_69651 = srem32(global_tid_69652, group_sizze_70115 *\n squot32(num_groups_70125 + smax32(1, N_68316) - 1,\n smax32(1, N_68316))) +\n thread_per_segment_82077 * i_82090;\n // apply map function\n {\n double alphas_elem_70135;\n ",
" double res_70140;\n double x_70154;\n double res_70164;\n double y_70217;\n double res_70218;\n double res_70219;\n \n alphas_elem_70135 = *(__global\n double *) &alphas_mem_80367[gtid_69651 * 8];\n \n double x_70143 = 0.0;\n \n for (int32_t chunk_offset_70142 = 0; chunk_offset_70142 < D_68333;\n chunk_offset_70142++) {\n double x_70150;\n double res_70153;\n \n x_70150 = *(__global double *) &mem_80418[(chunk_offset_70142 *\n K_68321 +\n gtid_69651) * 8];\n res_70153 = x_70143 + x_70150;\n \n double x_tmp_82091 = res_70153;\n \n x_70143 = x_tmp_82091;\n }\n res_70140 = x_70143;\n x_70154 = alphas_elem_70135 + res_70140;\n for (int32_t i_70159 = 0; i_70159 < D_68333; i_70159++) {\n double x_elem_elem_70160;\n double means_elem_elem_70161;\n double res_70162;\n \n x_elem_elem_70160 = *(__global\n double *) &x_mem_80366[(gtid_69623 *\n D_68317 +\n i_70159) * 8];\n means_elem_elem_70161 = *(__global\n double *) &mem_80422[(i_70159 *\n K_68319 +\n gtid_69651) *\n 8];\n res_70162 = x_elem_elem_70160 - means_elem_elem_70161;\n *(__global double *) &mem_80429",
"[(group_id_69654 *\n (group_sizze_70115 * D_68333) +\n local_tid_69653 + i_70159 *\n group_sizze_70115) * 8] =\n res_70162;\n }\n \n double x_70167 = 0.0;\n \n for (int32_t chunk_offset_70166 = 0; chunk_offset_70166 < D_68333;\n chunk_offset_70166++) {\n double qs_elem_elem_70177;\n double res_70179;\n double res_70214;\n double res_70216;\n \n qs_elem_elem_70177 = *(__global\n double *) &mem_80418[(chunk_offset_70166 *\n K_68321 +\n gtid_69651) * 8];\n \n double x_70182 = 0.0;\n \n for (int32_t chunk_offset_70181 = 0; chunk_offset_70181 <\n D_68333; chunk_offset_70181++) {\n double x_70192;\n bool cond_70194;\n double res_70195;\n double res_70211;\n double res_70213;\n \n x_70192 = *(__global double *) &mem_80429[(group_id_69654 *\n (group_sizze_70115 *\n D_68333) +\n local_tid_69653 +\n chunk_offset_70181 *\n group_sizze_70115) *\n 8];\n cond_70194 = slt32(chunk_offset_70166, chunk_offset_70181);\n if (cond_70194) {\n ",
" res_70195 = 0.0;\n } else {\n bool cond_70196;\n double res_70197;\n \n cond_70196 = chunk_offset_70166 == chunk_offset_70181;\n if (cond_70196) {\n double res_70198;\n \n res_70198 = futrts_exp64(qs_elem_elem_70177);\n res_70197 = res_70198;\n } else {\n int32_t y_70199;\n int32_t x_70200;\n int32_t res_70201;\n int32_t gmm_knossos_tri_arg_70202;\n int32_t y_70203;\n int32_t x_70204;\n int32_t res_70205;\n int32_t x_70206;\n int32_t x_70207;\n int32_t y_70208;\n int32_t i_70209;\n double res_70210;\n \n y_70199 = D_68333 - 1;\n x_70200 = D_68333 * y_70199;\n res_70201 = sdiv32(x_70200, 2);\n gmm_knossos_tri_arg_70202 = D_68333 -\n chunk_offset_70181;\n y_70203 = gmm_knossos_tri_arg_70202 - 1;\n x_70204 = gmm_knossos_tri_arg_70202 * y_70203;\n res_70205 = sdiv32(x_70204, 2);\n x_70206 = res_70201 - res_70205;\n x_70207 = chunk_offset_70166 - chunk_offset_70181;\n y_70208 = x_70207 - 1;\n i_70209 = x_70206 + y_70208;\n res_70210 = *(__global\n double *) &mem_80426[(i_70209 *\n ",
" K_68323 +\n gtid_69651) *\n 8];\n res_70197 = res_70210;\n }\n res_70195 = res_70197;\n }\n res_70211 = x_70192 * res_70195;\n res_70213 = x_70182 + res_70211;\n \n double x_tmp_82094 = res_70213;\n \n x_70182 = x_tmp_82094;\n }\n res_70179 = x_70182;\n res_70214 = res_70179 * res_70179;\n res_70216 = x_70167 + res_70214;\n \n double x_tmp_82093 = res_70216;\n \n x_70167 = x_tmp_82093;\n }\n res_70164 = x_70167;\n y_70217 = 0.5 * res_70164;\n res_70218 = x_70154 - y_70217;\n res_70219 = futrts_exp64(res_70218);\n // save results to be reduced\n {\n x_70132 = res_70219;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_70133 = x_70131 + x_70132;\n \n x_70131 = res_70133;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82082[local_tid_69653 * 8] = x_70131;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82095;\n int32_t skip_waves_82096;\n double x_82087;\n double x_82088;\n \n offset_82095 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_69653, group_sizze_70115)) {\n x_82087 = *(__local double *) &red_arr_mem_82082[(local_tid_69653 +\n offset_82095) *\n ",
" 8];\n }\n }\n offset_82095 = 1;\n while (slt32(offset_82095, wave_sizze_82073)) {\n if (slt32(local_tid_69653 + offset_82095, group_sizze_70115) &&\n ((local_tid_69653 - squot32(local_tid_69653, wave_sizze_82073) *\n wave_sizze_82073) & (2 * offset_82095 - 1)) == 0) {\n // read array element\n {\n x_82088 = *(volatile __local\n double *) &red_arr_mem_82082[(local_tid_69653 +\n offset_82095) * 8];\n }\n // apply reduction operation\n {\n double res_82089 = x_82087 + x_82088;\n \n x_82087 = res_82089;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82082[local_tid_69653 * 8] = x_82087;\n }\n }\n offset_82095 *= 2;\n }\n skip_waves_82096 = 1;\n while (slt32(skip_waves_82096, squot32(group_sizze_70115 +\n wave_sizze_82073 - 1,\n wave_sizze_82073))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82095 = skip_waves_82096 * wave_sizze_82073;\n if (slt32(local_tid_69653 + offset_82095, group_sizze_70115) &&\n ((local_tid_69653 - squot32(local_tid_69653, wave_sizze_82073) *\n wave_sizze_82073) == 0 && (squot32(local_tid_69653,\n wave_sizze_82073) & (2 *\n skip_waves_82096 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82088 = *(__local\n double *) &red_arr_mem_82082[(local_tid_69653 +\n ",
" offset_82095) * 8];\n }\n // apply reduction operation\n {\n double res_82089 = x_82087 + x_82088;\n \n x_82087 = res_82089;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82082[local_tid_69653 * 8] =\n x_82087;\n }\n }\n skip_waves_82096 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_70125 + smax32(1, N_68316) - 1, smax32(1,\n N_68316)) ==\n 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_69653 == 0) {\n *(__global double *) &mem_80432[gtid_69623 * 8] = x_82087;\n }\n }\n } else {\n int32_t old_counter_82097;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_69653 == 0) {\n *(__global double *) &group_res_arr_mem_82078[group_id_69654 *\n 8] = x_82087;\n mem_fence_global();\n old_counter_82097 = atomic_add((volatile __global int *) &\n counter_mem_82080[srem32(squot32(group_id_69654,\n squot32(num_groups_70125 +\n smax32(1,\n N_68316) -\n 1,\n smax32(1,\n ",
" N_68316))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82084[0] = old_counter_82097 ==\n squot32(num_groups_70125 + smax32(1, N_68316) - 1, smax32(1,\n N_68316)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82098 = *(__local bool *) &sync_arr_mem_82084[0];\n \n if (is_last_group_82098) {\n if (local_tid_69653 == 0) {\n old_counter_82097 = atomic_add((volatile __global int *) &\n counter_mem_82080[srem32(squot32(group_id_69654,\n squot32(num_groups_70125 +\n smax32(1,\n N_68316) -\n 1,\n smax32(1,\n N_68316))),\n 1024) *\n 4], 0 -\n squot32(num_groups_70125 +\n smax32(1, N_68316) - 1,\n smax32(1, N_68316)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_69653, squot32(num_groups_70125 + smax32(1,\n ",
" N_68316) -\n 1, smax32(1, N_68316)))) {\n x_70131 = *(__global\n double *) &group_res_arr_mem_82078[(squot32(group_id_69654,\n squot32(num_groups_70125 +\n smax32(1,\n N_68316) -\n 1,\n smax32(1,\n N_68316))) *\n squot32(num_groups_70125 +\n smax32(1,\n N_68316) -\n 1,\n smax32(1,\n N_68316)) +\n local_tid_69653) *\n 8];\n } else {\n x_70131 = 0.0;\n }\n *(__local double *) &red_arr_mem_82082[local_tid_69653 * 8] =\n x_70131;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82099;\n int32_t skip_waves_82100;\n double x_82087;\n doub",
"le x_82088;\n \n offset_82099 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_69653, group_sizze_70115)) {\n x_82087 = *(__local\n double *) &red_arr_mem_82082[(local_tid_69653 +\n offset_82099) *\n 8];\n }\n }\n offset_82099 = 1;\n while (slt32(offset_82099, wave_sizze_82073)) {\n if (slt32(local_tid_69653 + offset_82099,\n group_sizze_70115) && ((local_tid_69653 -\n squot32(local_tid_69653,\n wave_sizze_82073) *\n wave_sizze_82073) & (2 *\n offset_82099 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82088 = *(volatile __local\n double *) &red_arr_mem_82082[(local_tid_69653 +\n offset_82099) *\n 8];\n }\n // apply reduction operation\n {\n double res_82089 = x_82087 + x_82088;\n \n x_82087 = res_82089;\n }\n // write result of operation\n {\n *(volatile __local\n ",
" double *) &red_arr_mem_82082[local_tid_69653 *\n 8] = x_82087;\n }\n }\n offset_82099 *= 2;\n }\n skip_waves_82100 = 1;\n while (slt32(skip_waves_82100, squot32(group_sizze_70115 +\n wave_sizze_82073 - 1,\n wave_sizze_82073))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82099 = skip_waves_82100 * wave_sizze_82073;\n if (slt32(local_tid_69653 + offset_82099,\n group_sizze_70115) && ((local_tid_69653 -\n squot32(local_tid_69653,\n wave_sizze_82073) *\n wave_sizze_82073) == 0 &&\n (squot32(local_tid_69653,\n wave_sizze_82073) &\n (2 * skip_waves_82100 -\n 1)) == 0)) {\n // read array element\n {\n x_82088 = *(__local\n double *) &red_arr_mem_82082[(local_tid_69653 +\n offset_82099) *\n 8];\n }\n // apply reduction operation\n {\n double res_82089 = x_82087 + x_82088;\n \n x_82087 = res_82089;\n }\n // write result of oper",
"ation\n {\n *(__local\n double *) &red_arr_mem_82082[local_tid_69653 *\n 8] = x_82087;\n }\n }\n skip_waves_82100 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_69653 == 0) {\n *(__global double *) &mem_80432[gtid_69623 * 8] =\n x_82087;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_70701(int32_t K_68318, int32_t D_68322,\n int32_t D_68333, int32_t num_groups_71156,\n __global unsigned char *qs_mem_80369, __global\n unsigned char *mem_80492,\n int32_t thread_per_segment_82310, __global\n unsigned char *group_res_arr_mem_82311,\n __global unsigned char *counter_mem_82313)\n{\n const int32_t group_sizze_71146 = gmm_objectivezigroup_sizze_70683;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82315_backing_0, 8 *\n gmm_objectivezigroup_sizze_70683);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82317_backing_1, 1);\n \n int32_t global_tid_70701;\n int32_t local_tid_70702;\n int32_t group_sizze_82307;\n int32_t wave_sizze_82306;\n int32_t group_id_70703;\n \n global_tid_70701 = get_global_id(0);\n local_tid_70702 = get_local_id(0);\n group_sizze_82307 = get_local_size(0);\n wave_sizze_82306 = LOCKSTEP_WIDTH;\n group_id_70703 = get_group_id(0);\n \n int32_t gtid_70679;\n int32_t gtid_70700;\n __local char *red_arr_mem_82315;\n \n red_arr_mem_82315 = (__local char *) red_arr_mem_823",
"15_backing_0;\n \n __local char *sync_arr_mem_82317;\n \n sync_arr_mem_82317 = (__local char *) sync_arr_mem_82317_backing_1;\n gtid_70679 = squot32(group_id_70703, squot32(num_groups_71156 + smax32(1,\n K_68318) -\n 1, smax32(1, K_68318)));\n \n int32_t chunk_sizze_82319 = smin32(squot32(D_68333 + group_sizze_71146 *\n squot32(num_groups_71156 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318)) - 1,\n group_sizze_71146 *\n squot32(num_groups_71156 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318))),\n squot32(D_68333 -\n srem32(global_tid_70701,\n group_sizze_71146 *\n squot32(num_groups_71156 +\n smax32(1,\n K_68318) -\n 1, smax32(1,\n K_68318))) +\n thread_per_segment_82310 - 1,\n thread_per_segment_82310));\n double x_71162;\n double x_71163;\n \n x_71162 = 0.0;\n for (int32_t i_82323 = 0; i_82323 < chunk_sizze_82319; i_82323++) {\n gtid_70700 = srem32(global_tid_70701, group_sizze_71146 *\n squot32(num_groups_71156 + smax32(1, K",
"_68318) - 1,\n smax32(1, K_68318))) +\n thread_per_segment_82310 * i_82323;\n // apply map function\n {\n double x_71166 = *(__global double *) &qs_mem_80369[(gtid_70679 *\n D_68322 +\n gtid_70700) *\n 8];\n \n // save results to be reduced\n {\n x_71163 = x_71166;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_71164 = x_71162 + x_71163;\n \n x_71162 = res_71164;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82315[local_tid_70702 * 8] = x_71162;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82324;\n int32_t skip_waves_82325;\n double x_82320;\n double x_82321;\n \n offset_82324 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70702, group_sizze_71146)) {\n x_82320 = *(__local double *) &red_arr_mem_82315[(local_tid_70702 +\n offset_82324) *\n 8];\n }\n }\n offset_82324 = 1;\n while (slt32(offset_82324, wave_sizze_82306)) {\n if (slt32(local_tid_70702 + offset_82324, group_sizze_71146) &&\n ((local_tid_70702 - squot32(local_tid_70702, wave_sizze_82306) *\n wave_sizze_82306) & (2 * offset_82324 - 1)) == 0) {\n // read array element\n {\n x_82321 = *(volatile __local\n double *) &red_arr_mem_82315[(local_tid_70702 +\n ",
" offset_82324) * 8];\n }\n // apply reduction operation\n {\n double res_82322 = x_82320 + x_82321;\n \n x_82320 = res_82322;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82315[local_tid_70702 * 8] = x_82320;\n }\n }\n offset_82324 *= 2;\n }\n skip_waves_82325 = 1;\n while (slt32(skip_waves_82325, squot32(group_sizze_71146 +\n wave_sizze_82306 - 1,\n wave_sizze_82306))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82324 = skip_waves_82325 * wave_sizze_82306;\n if (slt32(local_tid_70702 + offset_82324, group_sizze_71146) &&\n ((local_tid_70702 - squot32(local_tid_70702, wave_sizze_82306) *\n wave_sizze_82306) == 0 && (squot32(local_tid_70702,\n wave_sizze_82306) & (2 *\n skip_waves_82325 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82321 = *(__local\n double *) &red_arr_mem_82315[(local_tid_70702 +\n offset_82324) * 8];\n }\n // apply reduction operation\n {\n double res_82322 = x_82320 + x_82321;\n \n x_82320 = res_82322;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82315[local_tid_70702 * 8] =\n x_82320;\n }\n }\n skip_waves_82325 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_71156 + smax32(1, K_68318) - 1, smax32(1,\n ",
" K_68318)) ==\n 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_70702 == 0) {\n *(__global double *) &mem_80492[gtid_70679 * 8] = x_82320;\n }\n }\n } else {\n int32_t old_counter_82326;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_70702 == 0) {\n *(__global double *) &group_res_arr_mem_82311[group_id_70703 *\n 8] = x_82320;\n mem_fence_global();\n old_counter_82326 = atomic_add((volatile __global int *) &\n counter_mem_82313[srem32(squot32(group_id_70703,\n squot32(num_groups_71156 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82317[0] = old_counter_82326 ==\n squot32(num_groups_71156 + smax32(1, K_68318) - 1, smax32(1,\n K_68318)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82327 = *(__local bool *) &sync_",
"arr_mem_82317[0];\n \n if (is_last_group_82327) {\n if (local_tid_70702 == 0) {\n old_counter_82326 = atomic_add((volatile __global int *) &\n counter_mem_82313[srem32(squot32(group_id_70703,\n squot32(num_groups_71156 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))),\n 1024) *\n 4], 0 -\n squot32(num_groups_71156 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_70702, squot32(num_groups_71156 + smax32(1,\n K_68318) -\n 1, smax32(1, K_68318)))) {\n x_71162 = *(__global\n double *) &group_res_arr_mem_82311[(squot32(group_id_70703,\n squot32(num_groups_71156 +\n smax32(1,\n ",
" K_68318) -\n 1,\n smax32(1,\n K_68318))) *\n squot32(num_groups_71156 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318)) +\n local_tid_70702) *\n 8];\n } else {\n x_71162 = 0.0;\n }\n *(__local double *) &red_arr_mem_82315[local_tid_70702 * 8] =\n x_71162;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82328;\n int32_t skip_waves_82329;\n double x_82320;\n double x_82321;\n \n offset_82328 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70702, group_sizze_71146)) {\n x_82320 = *(__local\n double *) &red_arr_mem_82315[(local_tid_70702 +\n offset_82328) *\n 8];\n }\n }\n offset_82328 = 1;\n ",
" while (slt32(offset_82328, wave_sizze_82306)) {\n if (slt32(local_tid_70702 + offset_82328,\n group_sizze_71146) && ((local_tid_70702 -\n squot32(local_tid_70702,\n wave_sizze_82306) *\n wave_sizze_82306) & (2 *\n offset_82328 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82321 = *(volatile __local\n double *) &red_arr_mem_82315[(local_tid_70702 +\n offset_82328) *\n 8];\n }\n // apply reduction operation\n {\n double res_82322 = x_82320 + x_82321;\n \n x_82320 = res_82322;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82315[local_tid_70702 *\n 8] = x_82320;\n }\n }\n offset_82328 *= 2;\n }\n skip_waves_82329 = 1;\n while (slt32(skip_waves_82329, squot32(group_sizze_71146 +\n wave_sizze_82306 - 1,\n wave_sizze_82306))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n ",
"offset_82328 = skip_waves_82329 * wave_sizze_82306;\n if (slt32(local_tid_70702 + offset_82328,\n group_sizze_71146) && ((local_tid_70702 -\n squot32(local_tid_70702,\n wave_sizze_82306) *\n wave_sizze_82306) == 0 &&\n (squot32(local_tid_70702,\n wave_sizze_82306) &\n (2 * skip_waves_82329 -\n 1)) == 0)) {\n // read array element\n {\n x_82321 = *(__local\n double *) &red_arr_mem_82315[(local_tid_70702 +\n offset_82328) *\n 8];\n }\n // apply reduction operation\n {\n double res_82322 = x_82320 + x_82321;\n \n x_82320 = res_82322;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82315[local_tid_70702 *\n 8] = x_82320;\n }\n }\n skip_waves_82329 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_70702 == 0) {\n *(__global double *) &mem_80492[gtid_70679 * 8] =\n x_82320;\n ",
" }\n }\n }\n }\n }\n}\n__kernel void segred_large_70743(int32_t K_68318, int32_t triD_68324,\n int32_t num_groups_71120, __global\n unsigned char *icf_mem_80370, __global\n unsigned char *mem_80486,\n int32_t thread_per_segment_82271, __global\n unsigned char *group_res_arr_mem_82272,\n __global unsigned char *counter_mem_82274)\n{\n const int32_t group_sizze_71110 = gmm_objectivezigroup_sizze_70725;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82276_backing_0, 8 *\n gmm_objectivezigroup_sizze_70725);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82278_backing_1, 1);\n \n int32_t global_tid_70743;\n int32_t local_tid_70744;\n int32_t group_sizze_82268;\n int32_t wave_sizze_82267;\n int32_t group_id_70745;\n \n global_tid_70743 = get_global_id(0);\n local_tid_70744 = get_local_id(0);\n group_sizze_82268 = get_local_size(0);\n wave_sizze_82267 = LOCKSTEP_WIDTH;\n group_id_70745 = get_group_id(0);\n \n int32_t gtid_70721;\n int32_t gtid_70742;\n __local char *red_arr_mem_82276;\n \n red_arr_mem_82276 = (__local char *) red_arr_mem_82276_backing_0;\n \n __local char *sync_arr_mem_82278;\n \n sync_arr_mem_82278 = (__local char *) sync_arr_mem_82278_backing_1;\n gtid_70721 = squot32(group_id_70745, squot32(num_groups_71120 + smax32(1,\n K_68318) -\n 1, smax32(1, K_68318)));\n \n int32_t chunk_sizze_82280 = smin32(squot32(triD_68324 + group_sizze_71110 *\n squot32(num_groups_71120 +\n smax",
"32(1, K_68318) - 1,\n smax32(1, K_68318)) - 1,\n group_sizze_71110 *\n squot32(num_groups_71120 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318))),\n squot32(triD_68324 -\n srem32(global_tid_70743,\n group_sizze_71110 *\n squot32(num_groups_71120 +\n smax32(1,\n K_68318) -\n 1, smax32(1,\n K_68318))) +\n thread_per_segment_82271 - 1,\n thread_per_segment_82271));\n double x_71126;\n double x_71127;\n \n x_71126 = 0.0;\n for (int32_t i_82284 = 0; i_82284 < chunk_sizze_82280; i_82284++) {\n gtid_70742 = srem32(global_tid_70743, group_sizze_71110 *\n squot32(num_groups_71120 + smax32(1, K_68318) - 1,\n smax32(1, K_68318))) +\n thread_per_segment_82271 * i_82284;\n // apply map function\n {\n double x_71130;\n double res_71131;\n \n x_71130 = *(__global double *) &icf_mem_80370[(gtid_70721 *\n triD_68324 +\n gtid_70742) * 8];\n res_71131 = x_71130 * x_71130;\n // save results to be reduced\n {\n x_71127 = res_711",
"31;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_71128 = x_71126 + x_71127;\n \n x_71126 = res_71128;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82276[local_tid_70744 * 8] = x_71126;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82285;\n int32_t skip_waves_82286;\n double x_82281;\n double x_82282;\n \n offset_82285 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70744, group_sizze_71110)) {\n x_82281 = *(__local double *) &red_arr_mem_82276[(local_tid_70744 +\n offset_82285) *\n 8];\n }\n }\n offset_82285 = 1;\n while (slt32(offset_82285, wave_sizze_82267)) {\n if (slt32(local_tid_70744 + offset_82285, group_sizze_71110) &&\n ((local_tid_70744 - squot32(local_tid_70744, wave_sizze_82267) *\n wave_sizze_82267) & (2 * offset_82285 - 1)) == 0) {\n // read array element\n {\n x_82282 = *(volatile __local\n double *) &red_arr_mem_82276[(local_tid_70744 +\n offset_82285) * 8];\n }\n // apply reduction operation\n {\n double res_82283 = x_82281 + x_82282;\n \n x_82281 = res_82283;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82276[local_tid_70744 * 8] = x_82281;\n }\n }\n offset_82285 *= 2;\n }\n skip_waves_82286 = 1;\n while (slt32(skip_waves_82286, squot32(group_sizze_71110 +\n ",
" wave_sizze_82267 - 1,\n wave_sizze_82267))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82285 = skip_waves_82286 * wave_sizze_82267;\n if (slt32(local_tid_70744 + offset_82285, group_sizze_71110) &&\n ((local_tid_70744 - squot32(local_tid_70744, wave_sizze_82267) *\n wave_sizze_82267) == 0 && (squot32(local_tid_70744,\n wave_sizze_82267) & (2 *\n skip_waves_82286 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82282 = *(__local\n double *) &red_arr_mem_82276[(local_tid_70744 +\n offset_82285) * 8];\n }\n // apply reduction operation\n {\n double res_82283 = x_82281 + x_82282;\n \n x_82281 = res_82283;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82276[local_tid_70744 * 8] =\n x_82281;\n }\n }\n skip_waves_82286 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_71120 + smax32(1, K_68318) - 1, smax32(1,\n K_68318)) ==\n 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_70744 == 0) {\n *(__global double *) &mem_80486[gtid_70721 * 8] = x_82281;\n }\n }\n } else {\n int32_t old_counter_82287;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_70744 == 0) {\n *(__global double *) &group_res_arr_mem_82272[group_id_70745 *\n ",
" 8] = x_82281;\n mem_fence_global();\n old_counter_82287 = atomic_add((volatile __global int *) &\n counter_mem_82274[srem32(squot32(group_id_70745,\n squot32(num_groups_71120 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82278[0] = old_counter_82287 ==\n squot32(num_groups_71120 + smax32(1, K_68318) - 1, smax32(1,\n K_68318)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82288 = *(__local bool *) &sync_arr_mem_82278[0];\n \n if (is_last_group_82288) {\n if (local_tid_70744 == 0) {\n old_counter_82287 = atomic_add((volatile __global int *) &\n counter_mem_82274[srem32(squot32(group_id_70745,\n squot32(num_groups_71120 +\n smax32(1,\n ",
" K_68318) -\n 1,\n smax32(1,\n K_68318))),\n 1024) *\n 4], 0 -\n squot32(num_groups_71120 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_70744, squot32(num_groups_71120 + smax32(1,\n K_68318) -\n 1, smax32(1, K_68318)))) {\n x_71126 = *(__global\n double *) &group_res_arr_mem_82272[(squot32(group_id_70745,\n squot32(num_groups_71120 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))) *\n squot32(num_groups_71120 +\n smax32(1,\n ",
" K_68318) -\n 1,\n smax32(1,\n K_68318)) +\n local_tid_70744) *\n 8];\n } else {\n x_71126 = 0.0;\n }\n *(__local double *) &red_arr_mem_82276[local_tid_70744 * 8] =\n x_71126;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82289;\n int32_t skip_waves_82290;\n double x_82281;\n double x_82282;\n \n offset_82289 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70744, group_sizze_71110)) {\n x_82281 = *(__local\n double *) &red_arr_mem_82276[(local_tid_70744 +\n offset_82289) *\n 8];\n }\n }\n offset_82289 = 1;\n while (slt32(offset_82289, wave_sizze_82267)) {\n if (slt32(local_tid_70744 + offset_82289,\n group_sizze_71110) && ((local_tid_70744 -\n squot32(local_tid_70744,\n wave_sizze_82267) *\n wave_sizze_82267) & (2 *\n offset_82289 -\n ",
" 1)) ==\n 0) {\n // read array element\n {\n x_82282 = *(volatile __local\n double *) &red_arr_mem_82276[(local_tid_70744 +\n offset_82289) *\n 8];\n }\n // apply reduction operation\n {\n double res_82283 = x_82281 + x_82282;\n \n x_82281 = res_82283;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82276[local_tid_70744 *\n 8] = x_82281;\n }\n }\n offset_82289 *= 2;\n }\n skip_waves_82290 = 1;\n while (slt32(skip_waves_82290, squot32(group_sizze_71110 +\n wave_sizze_82267 - 1,\n wave_sizze_82267))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82289 = skip_waves_82290 * wave_sizze_82267;\n if (slt32(local_tid_70744 + offset_82289,\n group_sizze_71110) && ((local_tid_70744 -\n squot32(local_tid_70744,\n wave_sizze_82267) *\n wave_sizze_82267) == 0 &&\n (squot32(local_tid_70744,\n ",
" wave_sizze_82267) &\n (2 * skip_waves_82290 -\n 1)) == 0)) {\n // read array element\n {\n x_82282 = *(__local\n double *) &red_arr_mem_82276[(local_tid_70744 +\n offset_82289) *\n 8];\n }\n // apply reduction operation\n {\n double res_82283 = x_82281 + x_82282;\n \n x_82281 = res_82283;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82276[local_tid_70744 *\n 8] = x_82281;\n }\n }\n skip_waves_82290 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_70744 == 0) {\n *(__global double *) &mem_80486[gtid_70721 * 8] =\n x_82281;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_70768(int32_t K_68318, int32_t D_68322,\n int32_t D_68333, int32_t num_groups_71094,\n __global unsigned char *qs_mem_80369, __global\n unsigned char *mem_80483,\n int32_t thread_per_segment_82234, __global\n unsigned char *group_res_arr_mem_82235,\n __global unsigned char *counter_mem",
"_82237)\n{\n const int32_t group_sizze_71084 = gmm_objectivezigroup_sizze_70750;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82239_backing_0, 8 *\n gmm_objectivezigroup_sizze_70750);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82241_backing_1, 1);\n \n int32_t global_tid_70768;\n int32_t local_tid_70769;\n int32_t group_sizze_82231;\n int32_t wave_sizze_82230;\n int32_t group_id_70770;\n \n global_tid_70768 = get_global_id(0);\n local_tid_70769 = get_local_id(0);\n group_sizze_82231 = get_local_size(0);\n wave_sizze_82230 = LOCKSTEP_WIDTH;\n group_id_70770 = get_group_id(0);\n \n int32_t gtid_70746;\n int32_t gtid_70767;\n __local char *red_arr_mem_82239;\n \n red_arr_mem_82239 = (__local char *) red_arr_mem_82239_backing_0;\n \n __local char *sync_arr_mem_82241;\n \n sync_arr_mem_82241 = (__local char *) sync_arr_mem_82241_backing_1;\n gtid_70746 = squot32(group_id_70770, squot32(num_groups_71094 + smax32(1,\n K_68318) -\n 1, smax32(1, K_68318)));\n \n int32_t chunk_sizze_82243 = smin32(squot32(D_68333 + group_sizze_71084 *\n squot32(num_groups_71094 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318)) - 1,\n group_sizze_71084 *\n squot32(num_groups_71094 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318))),\n squot32(D_68333 -\n srem32(global_tid_70768,\n ",
" group_sizze_71084 *\n squot32(num_groups_71094 +\n smax32(1,\n K_68318) -\n 1, smax32(1,\n K_68318))) +\n thread_per_segment_82234 - 1,\n thread_per_segment_82234));\n double x_71100;\n double x_71101;\n \n x_71100 = 0.0;\n for (int32_t i_82247 = 0; i_82247 < chunk_sizze_82243; i_82247++) {\n gtid_70767 = srem32(global_tid_70768, group_sizze_71084 *\n squot32(num_groups_71094 + smax32(1, K_68318) - 1,\n smax32(1, K_68318))) +\n thread_per_segment_82234 * i_82247;\n // apply map function\n {\n double qs_elem_elem_71104;\n double res_71105;\n double res_71106;\n \n qs_elem_elem_71104 = *(__global\n double *) &qs_mem_80369[(gtid_70746 *\n D_68322 +\n gtid_70767) * 8];\n res_71105 = futrts_exp64(qs_elem_elem_71104);\n res_71106 = res_71105 * res_71105;\n // save results to be reduced\n {\n x_71101 = res_71106;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_71102 = x_71100 + x_71101;\n \n x_71100 = res_71102;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82239[local_tid_70769 * 8] = x_7",
"1100;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82248;\n int32_t skip_waves_82249;\n double x_82244;\n double x_82245;\n \n offset_82248 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70769, group_sizze_71084)) {\n x_82244 = *(__local double *) &red_arr_mem_82239[(local_tid_70769 +\n offset_82248) *\n 8];\n }\n }\n offset_82248 = 1;\n while (slt32(offset_82248, wave_sizze_82230)) {\n if (slt32(local_tid_70769 + offset_82248, group_sizze_71084) &&\n ((local_tid_70769 - squot32(local_tid_70769, wave_sizze_82230) *\n wave_sizze_82230) & (2 * offset_82248 - 1)) == 0) {\n // read array element\n {\n x_82245 = *(volatile __local\n double *) &red_arr_mem_82239[(local_tid_70769 +\n offset_82248) * 8];\n }\n // apply reduction operation\n {\n double res_82246 = x_82244 + x_82245;\n \n x_82244 = res_82246;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82239[local_tid_70769 * 8] = x_82244;\n }\n }\n offset_82248 *= 2;\n }\n skip_waves_82249 = 1;\n while (slt32(skip_waves_82249, squot32(group_sizze_71084 +\n wave_sizze_82230 - 1,\n wave_sizze_82230))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82248 = skip_waves_82249 * wave_sizze_82230;\n if (slt32(local_tid_70769 + offset_82248, group_sizze_71084) &&\n ((local_tid_70769 - squot32(local_tid_70769, wave_sizze_82230) *\n wave_sizze_82230) == 0 && (squot32(lo",
"cal_tid_70769,\n wave_sizze_82230) & (2 *\n skip_waves_82249 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82245 = *(__local\n double *) &red_arr_mem_82239[(local_tid_70769 +\n offset_82248) * 8];\n }\n // apply reduction operation\n {\n double res_82246 = x_82244 + x_82245;\n \n x_82244 = res_82246;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82239[local_tid_70769 * 8] =\n x_82244;\n }\n }\n skip_waves_82249 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_71094 + smax32(1, K_68318) - 1, smax32(1,\n K_68318)) ==\n 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_70769 == 0) {\n *(__global double *) &mem_80483[gtid_70746 * 8] = x_82244;\n }\n }\n } else {\n int32_t old_counter_82250;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_70769 == 0) {\n *(__global double *) &group_res_arr_mem_82235[group_id_70770 *\n 8] = x_82244;\n mem_fence_global();\n old_counter_82250 = atomic_add((volatile __global int *) &\n counter_mem_82237[srem32(squot32(group_id_70770,\n squot32(num_groups_71094 +\n ",
" smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82241[0] = old_counter_82250 ==\n squot32(num_groups_71094 + smax32(1, K_68318) - 1, smax32(1,\n K_68318)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82251 = *(__local bool *) &sync_arr_mem_82241[0];\n \n if (is_last_group_82251) {\n if (local_tid_70769 == 0) {\n old_counter_82250 = atomic_add((volatile __global int *) &\n counter_mem_82237[srem32(squot32(group_id_70770,\n squot32(num_groups_71094 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))),\n 1024) *\n ",
" 4], 0 -\n squot32(num_groups_71094 +\n smax32(1, K_68318) - 1,\n smax32(1, K_68318)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_70769, squot32(num_groups_71094 + smax32(1,\n K_68318) -\n 1, smax32(1, K_68318)))) {\n x_71100 = *(__global\n double *) &group_res_arr_mem_82235[(squot32(group_id_70770,\n squot32(num_groups_71094 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318))) *\n squot32(num_groups_71094 +\n smax32(1,\n K_68318) -\n 1,\n smax32(1,\n K_68318)) +\n local_tid_70769) *\n ",
" 8];\n } else {\n x_71100 = 0.0;\n }\n *(__local double *) &red_arr_mem_82239[local_tid_70769 * 8] =\n x_71100;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82252;\n int32_t skip_waves_82253;\n double x_82244;\n double x_82245;\n \n offset_82252 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_70769, group_sizze_71084)) {\n x_82244 = *(__local\n double *) &red_arr_mem_82239[(local_tid_70769 +\n offset_82252) *\n 8];\n }\n }\n offset_82252 = 1;\n while (slt32(offset_82252, wave_sizze_82230)) {\n if (slt32(local_tid_70769 + offset_82252,\n group_sizze_71084) && ((local_tid_70769 -\n squot32(local_tid_70769,\n wave_sizze_82230) *\n wave_sizze_82230) & (2 *\n offset_82252 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82245 = *(volatile __local\n double *) &red_arr_mem_82239[(local_tid_70769 +\n offset_82252) *\n ",
" 8];\n }\n // apply reduction operation\n {\n double res_82246 = x_82244 + x_82245;\n \n x_82244 = res_82246;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82239[local_tid_70769 *\n 8] = x_82244;\n }\n }\n offset_82252 *= 2;\n }\n skip_waves_82253 = 1;\n while (slt32(skip_waves_82253, squot32(group_sizze_71084 +\n wave_sizze_82230 - 1,\n wave_sizze_82230))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82252 = skip_waves_82253 * wave_sizze_82230;\n if (slt32(local_tid_70769 + offset_82252,\n group_sizze_71084) && ((local_tid_70769 -\n squot32(local_tid_70769,\n wave_sizze_82230) *\n wave_sizze_82230) == 0 &&\n (squot32(local_tid_70769,\n wave_sizze_82230) &\n (2 * skip_waves_82253 -\n 1)) == 0)) {\n // read array element\n {\n x_82245 = *(__local\n double *) &red_arr_mem_82239[(local_tid_70769 +\n ",
" offset_82252) *\n 8];\n }\n // apply reduction operation\n {\n double res_82246 = x_82244 + x_82245;\n \n x_82244 = res_82246;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82239[local_tid_70769 *\n 8] = x_82244;\n }\n }\n skip_waves_82253 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_70769 == 0) {\n *(__global double *) &mem_80483[gtid_70746 * 8] =\n x_82244;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_73083(int32_t N_68508, int32_t K_68510,\n int32_t D_68526, int32_t num_groups_77901,\n __global unsigned char *mem_81382, __global\n unsigned char *mem_81387,\n int32_t thread_per_segment_83014, __global\n unsigned char *group_res_arr_mem_83015,\n __global unsigned char *counter_mem_83017)\n{\n const int32_t group_sizze_77891 = rev_gmm_objectivezigroup_sizze_73065;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_83019_backing_0, 8 *\n rev_gmm_objectivezigroup_sizze_73065);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_83021_backing_1, 1);\n \n int32_t global_tid_73083;\n int32_t ",
"local_tid_73084;\n int32_t group_sizze_83011;\n int32_t wave_sizze_83010;\n int32_t group_id_73085;\n \n global_tid_73083 = get_global_id(0);\n local_tid_73084 = get_local_id(0);\n group_sizze_83011 = get_local_size(0);\n wave_sizze_83010 = LOCKSTEP_WIDTH;\n group_id_73085 = get_group_id(0);\n \n int32_t gtid_73057;\n int32_t gtid_73058;\n int32_t gtid_73059;\n int32_t gtid_73082;\n __local char *red_arr_mem_83019;\n \n red_arr_mem_83019 = (__local char *) red_arr_mem_83019_backing_0;\n \n __local char *sync_arr_mem_83021;\n \n sync_arr_mem_83021 = (__local char *) sync_arr_mem_83021_backing_1;\n gtid_73057 = squot32(squot32(group_id_73085, squot32(num_groups_77901 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))),\n K_68510 * D_68526);\n gtid_73058 = squot32(squot32(group_id_73085, squot32(num_groups_77901 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_73085,\n squot32(num_groups_77901 + smax32(1,\n N_68508 *\n ",
" K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526))),\n K_68510 * D_68526) * (K_68510 * D_68526),\n D_68526);\n gtid_73059 = squot32(group_id_73085, squot32(num_groups_77901 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_73085, squot32(num_groups_77901 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526))), K_68510 *\n D_68526) * (K_68510 * D_68526) - squot32(squot32(group_id_73085,\n squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n ",
" 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_73085,\n squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))),\n K_68510 *\n D_68526) *\n (K_68510 * D_68526),\n D_68526) * D_68526;\n \n int32_t chunk_sizze_83023 = smin32(squot32(D_68526 + group_sizze_77891 *\n squot32(num_groups_77901 +\n smax32(1, N_68508 *\n ",
" K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526)) - 1,\n group_sizze_77891 *\n squot32(num_groups_77901 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))),\n squot32(D_68526 -\n srem32(global_tid_73083,\n group_sizze_77891 *\n squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1,\n N_68508 *\n K_68510 *\n D_68526))) +\n thread_per_segment_83014 - 1,\n thread_per_se",
"gment_83014));\n double x_77907;\n double x_77908;\n \n x_77907 = 0.0;\n for (int32_t i_83027 = 0; i_83027 < chunk_sizze_83023; i_83027++) {\n gtid_73082 = srem32(global_tid_73083, group_sizze_77891 *\n squot32(num_groups_77901 + smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 * K_68510 * D_68526))) +\n thread_per_segment_83014 * i_83027;\n // apply map function\n {\n double x_77913 = *(__global double *) &mem_81382[(gtid_73057 *\n (D_68526 *\n D_68526 *\n K_68510) +\n gtid_73058 *\n (D_68526 *\n D_68526) +\n gtid_73059 *\n D_68526 +\n gtid_73082) * 8];\n \n // save results to be reduced\n {\n x_77908 = x_77913;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_77909 = x_77907 + x_77908;\n \n x_77907 = res_77909;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_83019[local_tid_73084 * 8] = x_77907;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_83028;\n int32_t skip_waves_83029;\n double x_83024;\n double x_830",
"25;\n \n offset_83028 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73084, group_sizze_77891)) {\n x_83024 = *(__local double *) &red_arr_mem_83019[(local_tid_73084 +\n offset_83028) *\n 8];\n }\n }\n offset_83028 = 1;\n while (slt32(offset_83028, wave_sizze_83010)) {\n if (slt32(local_tid_73084 + offset_83028, group_sizze_77891) &&\n ((local_tid_73084 - squot32(local_tid_73084, wave_sizze_83010) *\n wave_sizze_83010) & (2 * offset_83028 - 1)) == 0) {\n // read array element\n {\n x_83025 = *(volatile __local\n double *) &red_arr_mem_83019[(local_tid_73084 +\n offset_83028) * 8];\n }\n // apply reduction operation\n {\n double res_83026 = x_83024 + x_83025;\n \n x_83024 = res_83026;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_83019[local_tid_73084 * 8] = x_83024;\n }\n }\n offset_83028 *= 2;\n }\n skip_waves_83029 = 1;\n while (slt32(skip_waves_83029, squot32(group_sizze_77891 +\n wave_sizze_83010 - 1,\n wave_sizze_83010))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_83028 = skip_waves_83029 * wave_sizze_83010;\n if (slt32(local_tid_73084 + offset_83028, group_sizze_77891) &&\n ((local_tid_73084 - squot32(local_tid_73084, wave_sizze_83010) *\n wave_sizze_83010) == 0 && (squot32(local_tid_73084,\n wave_sizze_83010) & (2 *\n ",
" skip_waves_83029 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_83025 = *(__local\n double *) &red_arr_mem_83019[(local_tid_73084 +\n offset_83028) * 8];\n }\n // apply reduction operation\n {\n double res_83026 = x_83024 + x_83025;\n \n x_83024 = res_83026;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_83019[local_tid_73084 * 8] =\n x_83024;\n }\n }\n skip_waves_83029 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_77901 + smax32(1, N_68508 * K_68510 * D_68526) - 1,\n smax32(1, N_68508 * K_68510 * D_68526)) == 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_73084 == 0) {\n *(__global double *) &mem_81387[(gtid_73057 * (D_68526 *\n K_68510) +\n gtid_73058 * D_68526 +\n gtid_73059) * 8] = x_83024;\n }\n }\n } else {\n int32_t old_counter_83030;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_73084 == 0) {\n *(__global double *) &group_res_arr_mem_83015[group_id_73085 *\n 8] = x_83024;\n mem_fence_global();\n old_counter_83030 = atomic_add((volatile __global int *) &\n counter_mem_83017[srem32(squot32(group_id_73085,\n ",
" squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_83021[0] = old_counter_83030 ==\n squot32(num_groups_77901 + smax32(1, N_68508 * K_68510 *\n D_68526) - 1, smax32(1,\n N_68508 *\n K_68510 *\n D_68526)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_83031 = *(__local bool *) &sync_arr_mem_83021[0];\n \n if (is_last_group_83031) {\n if (local_tid_73084 == 0) {\n old_counter_83030 = atomic_add((volatile __global int *) &\n counter_mem_83017[sre",
"m32(squot32(group_id_73085,\n squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))),\n 1024) *\n 4], 0 -\n squot32(num_groups_77901 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_73084, squot32(num_groups_77901 + smax32(1,\n N_68508 *\n ",
" K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 *\n D_68526)))) {\n x_77907 = *(__global\n double *) &group_res_arr_mem_83015[(squot32(group_id_73085,\n squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))) *\n squot32(num_groups_77901 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n ",
" 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526)) +\n local_tid_73084) *\n 8];\n } else {\n x_77907 = 0.0;\n }\n *(__local double *) &red_arr_mem_83019[local_tid_73084 * 8] =\n x_77907;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_83032;\n int32_t skip_waves_83033;\n double x_83024;\n double x_83025;\n \n offset_83032 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73084, group_sizze_77891)) {\n x_83024 = *(__local\n double *) &red_arr_mem_83019[(local_tid_73084 +\n offset_83032) *\n 8];\n }\n }\n offset_83032 = 1;\n while (slt32(offset_83032, wave_sizze_83010)) {\n if (slt32(local_tid_73084 + offset_83032,\n group_sizze_77891) && ((local_tid_73084 -\n squot32(local_tid_73084,\n wave_sizze_83010) *\n wave_sizze_83010) & (2 *\n ",
" offset_83032 -\n 1)) ==\n 0) {\n // read array element\n {\n x_83025 = *(volatile __local\n double *) &red_arr_mem_83019[(local_tid_73084 +\n offset_83032) *\n 8];\n }\n // apply reduction operation\n {\n double res_83026 = x_83024 + x_83025;\n \n x_83024 = res_83026;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_83019[local_tid_73084 *\n 8] = x_83024;\n }\n }\n offset_83032 *= 2;\n }\n skip_waves_83033 = 1;\n while (slt32(skip_waves_83033, squot32(group_sizze_77891 +\n wave_sizze_83010 - 1,\n wave_sizze_83010))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_83032 = skip_waves_83033 * wave_sizze_83010;\n if (slt32(local_tid_73084 + offset_83032,\n group_sizze_77891) && ((local_tid_73084 -\n squot32(local_tid_73084,\n wave_sizze_83010) *\n wave_sizze_83010) == 0 &&\n ",
" (squot32(local_tid_73084,\n wave_sizze_83010) &\n (2 * skip_waves_83033 -\n 1)) == 0)) {\n // read array element\n {\n x_83025 = *(__local\n double *) &red_arr_mem_83019[(local_tid_73084 +\n offset_83032) *\n 8];\n }\n // apply reduction operation\n {\n double res_83026 = x_83024 + x_83025;\n \n x_83024 = res_83026;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_83019[local_tid_73084 *\n 8] = x_83024;\n }\n }\n skip_waves_83033 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_73084 == 0) {\n *(__global double *) &mem_81387[(gtid_73057 * (D_68526 *\n K_68510) +\n gtid_73058 * D_68526 +\n gtid_73059) * 8] =\n x_83024;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_73254(int32_t N_68508, int32_t K_68510,\n int32_t triD_68516, int32_t D_68526,\n ",
" int32_t num_groups_77803, __global\n unsigned char *mem_81324, __global\n unsigned char *mem_81329,\n int32_t thread_per_segment_82960, __global\n unsigned char *group_res_arr_mem_82961,\n __global unsigned char *counter_mem_82963)\n{\n const int32_t group_sizze_77793 = rev_gmm_objectivezigroup_sizze_73236;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82965_backing_0, 8 *\n rev_gmm_objectivezigroup_sizze_73236);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82967_backing_1, 1);\n \n int32_t global_tid_73254;\n int32_t local_tid_73255;\n int32_t group_sizze_82957;\n int32_t wave_sizze_82956;\n int32_t group_id_73256;\n \n global_tid_73254 = get_global_id(0);\n local_tid_73255 = get_local_id(0);\n group_sizze_82957 = get_local_size(0);\n wave_sizze_82956 = LOCKSTEP_WIDTH;\n group_id_73256 = get_group_id(0);\n \n int32_t gtid_73228;\n int32_t gtid_73229;\n int32_t gtid_73230;\n int32_t gtid_73253;\n __local char *red_arr_mem_82965;\n \n red_arr_mem_82965 = (__local char *) red_arr_mem_82965_backing_0;\n \n __local char *sync_arr_mem_82967;\n \n sync_arr_mem_82967 = (__local char *) sync_arr_mem_82967_backing_1;\n gtid_73228 = squot32(squot32(group_id_73256, squot32(num_groups_77803 +\n smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n triD_68516))),",
"\n K_68510 * triD_68516);\n gtid_73229 = squot32(squot32(group_id_73256, squot32(num_groups_77803 +\n smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n triD_68516))) -\n squot32(squot32(group_id_73256,\n squot32(num_groups_77803 + smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 *\n triD_68516))),\n K_68510 * triD_68516) * (K_68510 * triD_68516),\n triD_68516);\n gtid_73230 = squot32(group_id_73256, squot32(num_groups_77803 + smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 *\n triD_68516))) -\n squot32(squot32(group_id_73256, squot32(num_groups_77803 + smax32(1,\n ",
" N_68508 *\n K_68510 *\n triD_68516) -\n 1, smax32(1, N_68508 * K_68510 *\n triD_68516))),\n K_68510 * triD_68516) * (K_68510 * triD_68516) -\n squot32(squot32(group_id_73256, squot32(num_groups_77803 + smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1, smax32(1, N_68508 * K_68510 *\n triD_68516))) -\n squot32(squot32(group_id_73256, squot32(num_groups_77803 +\n smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n triD_68516))),\n K_68510 * triD_68516) * (K_68510 * triD_68516),\n triD_68516) * triD_68516;\n \n int32_t chunk_sizze_82969 = smin32(squot32(D_68526 + group_sizze_77793 *\n squot32(num_groups_77803 +\n smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1,",
" N_68508 *\n K_68510 *\n triD_68516)) - 1,\n group_sizze_77793 *\n squot32(num_groups_77803 +\n smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n triD_68516))),\n squot32(D_68526 -\n srem32(global_tid_73254,\n group_sizze_77793 *\n squot32(num_groups_77803 +\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1, smax32(1,\n N_68508 *\n K_68510 *\n triD_68516))) +\n thread_per_segment_82960 - 1,\n thread_per_segment_82960));\n double x_77809;\n double x_77810;\n \n x_77809 = 0.0;\n for (int32_t i_82973 = 0; i_82973 < chunk_sizze_82969; i_82973++) {\n gtid_73253 = srem32(global",
"_tid_73254, group_sizze_77793 *\n squot32(num_groups_77803 + smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1, N_68508 * K_68510 *\n triD_68516))) +\n thread_per_segment_82960 * i_82973;\n // apply map function\n {\n double x_77815 = *(__global double *) &mem_81324[(gtid_73228 *\n (D_68526 *\n triD_68516 *\n K_68510) +\n gtid_73229 *\n (D_68526 *\n triD_68516) +\n gtid_73230 *\n D_68526 +\n gtid_73253) * 8];\n \n // save results to be reduced\n {\n x_77810 = x_77815;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_77811 = x_77809 + x_77810;\n \n x_77809 = res_77811;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82965[local_tid_73255 * 8] = x_77809;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82974;\n int32_t skip_waves_82975;\n double x_82970;\n double x_82971;\n \n offset_82974 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73255, grou",
"p_sizze_77793)) {\n x_82970 = *(__local double *) &red_arr_mem_82965[(local_tid_73255 +\n offset_82974) *\n 8];\n }\n }\n offset_82974 = 1;\n while (slt32(offset_82974, wave_sizze_82956)) {\n if (slt32(local_tid_73255 + offset_82974, group_sizze_77793) &&\n ((local_tid_73255 - squot32(local_tid_73255, wave_sizze_82956) *\n wave_sizze_82956) & (2 * offset_82974 - 1)) == 0) {\n // read array element\n {\n x_82971 = *(volatile __local\n double *) &red_arr_mem_82965[(local_tid_73255 +\n offset_82974) * 8];\n }\n // apply reduction operation\n {\n double res_82972 = x_82970 + x_82971;\n \n x_82970 = res_82972;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82965[local_tid_73255 * 8] = x_82970;\n }\n }\n offset_82974 *= 2;\n }\n skip_waves_82975 = 1;\n while (slt32(skip_waves_82975, squot32(group_sizze_77793 +\n wave_sizze_82956 - 1,\n wave_sizze_82956))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82974 = skip_waves_82975 * wave_sizze_82956;\n if (slt32(local_tid_73255 + offset_82974, group_sizze_77793) &&\n ((local_tid_73255 - squot32(local_tid_73255, wave_sizze_82956) *\n wave_sizze_82956) == 0 && (squot32(local_tid_73255,\n wave_sizze_82956) & (2 *\n skip_waves_82975 -\n 1)) ==\n 0)) {",
"\n // read array element\n {\n x_82971 = *(__local\n double *) &red_arr_mem_82965[(local_tid_73255 +\n offset_82974) * 8];\n }\n // apply reduction operation\n {\n double res_82972 = x_82970 + x_82971;\n \n x_82970 = res_82972;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82965[local_tid_73255 * 8] =\n x_82970;\n }\n }\n skip_waves_82975 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_77803 + smax32(1, N_68508 * K_68510 * triD_68516) -\n 1, smax32(1, N_68508 * K_68510 * triD_68516)) == 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_73255 == 0) {\n *(__global double *) &mem_81329[(gtid_73228 * (triD_68516 *\n K_68510) +\n gtid_73229 * triD_68516 +\n gtid_73230) * 8] = x_82970;\n }\n }\n } else {\n int32_t old_counter_82976;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_73255 == 0) {\n *(__global double *) &group_res_arr_mem_82961[group_id_73256 *\n 8] = x_82970;\n mem_fence_global();\n old_counter_82976 = atomic_add((volatile __global int *) &\n counter_mem_82963[srem32(squot32(group_id_73256,\n squot32(num_groups_77803 +\n ",
" smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82967[0] = old_counter_82976 ==\n squot32(num_groups_77803 + smax32(1, N_68508 * K_68510 *\n triD_68516) - 1, smax32(1,\n N_68508 *\n K_68510 *\n triD_68516)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82977 = *(__local bool *) &sync_arr_mem_82967[0];\n \n if (is_last_group_82977) {\n if (local_tid_73255 == 0) {\n old_counter_82976 = atomic_add((volatile __global int *) &\n counter_mem_82963[srem32(squot32(group_id_73256,\n ",
" squot32(num_groups_77803 +\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516))),\n 1024) *\n 4], 0 -\n squot32(num_groups_77803 +\n smax32(1, N_68508 *\n K_68510 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n triD_68516)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_73255, squot32(num_groups_77803 + smax32(1,\n N_68508 *\n K_68510 *\n ",
" triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 *\n triD_68516)))) {\n x_77809 = *(__global\n double *) &group_res_arr_mem_82961[(squot32(group_id_73256,\n squot32(num_groups_77803 +\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516))) *\n squot32(num_groups_77803 +\n smax32(1,\n N_68508 *\n K_68510 *\n triD_68516) -\n 1,\n ",
" smax32(1,\n N_68508 *\n K_68510 *\n triD_68516)) +\n local_tid_73255) *\n 8];\n } else {\n x_77809 = 0.0;\n }\n *(__local double *) &red_arr_mem_82965[local_tid_73255 * 8] =\n x_77809;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82978;\n int32_t skip_waves_82979;\n double x_82970;\n double x_82971;\n \n offset_82978 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73255, group_sizze_77793)) {\n x_82970 = *(__local\n double *) &red_arr_mem_82965[(local_tid_73255 +\n offset_82978) *\n 8];\n }\n }\n offset_82978 = 1;\n while (slt32(offset_82978, wave_sizze_82956)) {\n if (slt32(local_tid_73255 + offset_82978,\n group_sizze_77793) && ((local_tid_73255 -\n squot32(local_tid_73255,\n wave_sizze_82956) *\n wave_sizze_82956) & (2 *\n offset_",
"82978 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82971 = *(volatile __local\n double *) &red_arr_mem_82965[(local_tid_73255 +\n offset_82978) *\n 8];\n }\n // apply reduction operation\n {\n double res_82972 = x_82970 + x_82971;\n \n x_82970 = res_82972;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82965[local_tid_73255 *\n 8] = x_82970;\n }\n }\n offset_82978 *= 2;\n }\n skip_waves_82979 = 1;\n while (slt32(skip_waves_82979, squot32(group_sizze_77793 +\n wave_sizze_82956 - 1,\n wave_sizze_82956))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82978 = skip_waves_82979 * wave_sizze_82956;\n if (slt32(local_tid_73255 + offset_82978,\n group_sizze_77793) && ((local_tid_73255 -\n squot32(local_tid_73255,\n wave_sizze_82956) *\n wave_sizze_82956) == 0 &&\n (squot32(local_tid_73255,\n ",
" wave_sizze_82956) &\n (2 * skip_waves_82979 -\n 1)) == 0)) {\n // read array element\n {\n x_82971 = *(__local\n double *) &red_arr_mem_82965[(local_tid_73255 +\n offset_82978) *\n 8];\n }\n // apply reduction operation\n {\n double res_82972 = x_82970 + x_82971;\n \n x_82970 = res_82972;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82965[local_tid_73255 *\n 8] = x_82970;\n }\n }\n skip_waves_82979 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_73255 == 0) {\n *(__global double *) &mem_81329[(gtid_73228 *\n (triD_68516 *\n K_68510) +\n gtid_73229 *\n triD_68516 +\n gtid_73230) * 8] =\n x_82970;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_73751(int32_t N_68508, int32_t K_68510,\n ",
" int32_t triD_68516, int32_t D_68526,\n int32_t num_groups_77705, __global\n unsigned char *mem_81263, __global\n unsigned char *mem_81269,\n int32_t thread_per_segment_82906, __global\n unsigned char *group_res_arr_mem_82907,\n __global unsigned char *counter_mem_82909)\n{\n const int32_t group_sizze_77695 = rev_gmm_objectivezigroup_sizze_73733;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82911_backing_0, 8 *\n rev_gmm_objectivezigroup_sizze_73733);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82913_backing_1, 1);\n \n int32_t global_tid_73751;\n int32_t local_tid_73752;\n int32_t group_sizze_82903;\n int32_t wave_sizze_82902;\n int32_t group_id_73753;\n \n global_tid_73751 = get_global_id(0);\n local_tid_73752 = get_local_id(0);\n group_sizze_82903 = get_local_size(0);\n wave_sizze_82902 = LOCKSTEP_WIDTH;\n group_id_73753 = get_group_id(0);\n \n int32_t gtid_73723;\n int32_t gtid_73724;\n int32_t gtid_73725;\n int32_t gtid_73726;\n int32_t gtid_73750;\n __local char *red_arr_mem_82911;\n \n red_arr_mem_82911 = (__local char *) red_arr_mem_82911_backing_0;\n \n __local char *sync_arr_mem_82913;\n \n sync_arr_mem_82913 = (__local char *) sync_arr_mem_82913_backing_1;\n gtid_73723 = squot32(squot32(group_id_73753, squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n ",
" K_68510 *\n D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516);\n gtid_73724 = squot32(squot32(group_id_73753, squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753,\n squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516) * (K_68510 *\n D_68526 *\n triD_68516),\n D_68526 * triD_68516);\n gtid_73725 = squot32(squot32(group_id_73753, squo",
"t32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753,\n squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516) * (K_68510 *\n D_68526 *\n triD_68516) -\n squot32(squot32(group_id_73753,\n squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n ",
" triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753,\n squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516) *\n (K_68510 * D_68526 * triD_68516), D_68526 *\n triD_68516) * (D_68526 * triD_68516),\n triD_68516);\n gtid_73726 = squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753, squot32(num_groups_",
"77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516) * (K_68510 * D_68526 *\n triD_68516) -\n squot32(squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753, squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n ",
" triD_68516))),\n K_68510 * D_68526 * triD_68516) * (K_68510 * D_68526 *\n triD_68516),\n D_68526 * triD_68516) * (D_68526 * triD_68516) -\n squot32(squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753, squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516) * (K_68510 * D_68526 *\n triD_68516) -\n squot32(squot32(group_id_73753, squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n ",
" D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))) -\n squot32(squot32(group_id_73753,\n squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526 *\n triD_68516))),\n K_68510 * D_68526 * triD_68516) * (K_68510 *\n D_68526 *\n triD_68516),\n D_68526 * triD_68516) * (D_68526 * triD_68516),\n triD_68516) * triD_68516;\n \n int32_t chunk_sizze_82915 = smin32(squot32(D_68526 + group_sizze_77695 *\n squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n ",
" K_68510 *\n D_68526 *\n triD_68516)) - 1,\n group_sizze_77695 *\n squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))),\n squot32(D_68526 -\n srem32(global_tid_73751,\n group_sizze_77695 *\n squot32(num_groups_77705 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD",
"_68516))) +\n thread_per_segment_82906 - 1,\n thread_per_segment_82906));\n double x_77711;\n double x_77712;\n \n x_77711 = 0.0;\n for (int32_t i_82919 = 0; i_82919 < chunk_sizze_82915; i_82919++) {\n gtid_73750 = srem32(global_tid_73751, group_sizze_77695 *\n squot32(num_groups_77705 + smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 * K_68510 * D_68526 *\n triD_68516))) +\n thread_per_segment_82906 * i_82919;\n // apply map function\n {\n double x_77718 = *(__global double *) &mem_81263[(gtid_73723 *\n (D_68526 *\n triD_68516 *\n D_68526 *\n K_68510) +\n gtid_73724 *\n (D_68526 *\n triD_68516 *\n D_68526) +\n gtid_73725 *\n (D_68526 *\n triD_68516) +\n gtid_73726 *\n D_68526 +\n gtid_73750",
") * 8];\n \n // save results to be reduced\n {\n x_77712 = x_77718;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_77713 = x_77711 + x_77712;\n \n x_77711 = res_77713;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82911[local_tid_73752 * 8] = x_77711;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82920;\n int32_t skip_waves_82921;\n double x_82916;\n double x_82917;\n \n offset_82920 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73752, group_sizze_77695)) {\n x_82916 = *(__local double *) &red_arr_mem_82911[(local_tid_73752 +\n offset_82920) *\n 8];\n }\n }\n offset_82920 = 1;\n while (slt32(offset_82920, wave_sizze_82902)) {\n if (slt32(local_tid_73752 + offset_82920, group_sizze_77695) &&\n ((local_tid_73752 - squot32(local_tid_73752, wave_sizze_82902) *\n wave_sizze_82902) & (2 * offset_82920 - 1)) == 0) {\n // read array element\n {\n x_82917 = *(volatile __local\n double *) &red_arr_mem_82911[(local_tid_73752 +\n offset_82920) * 8];\n }\n // apply reduction operation\n {\n double res_82918 = x_82916 + x_82917;\n \n x_82916 = res_82918;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82911[local_tid_73752 * 8] = x_82916;\n }\n }\n offset_82920 *= 2;\n }\n ",
" skip_waves_82921 = 1;\n while (slt32(skip_waves_82921, squot32(group_sizze_77695 +\n wave_sizze_82902 - 1,\n wave_sizze_82902))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82920 = skip_waves_82921 * wave_sizze_82902;\n if (slt32(local_tid_73752 + offset_82920, group_sizze_77695) &&\n ((local_tid_73752 - squot32(local_tid_73752, wave_sizze_82902) *\n wave_sizze_82902) == 0 && (squot32(local_tid_73752,\n wave_sizze_82902) & (2 *\n skip_waves_82921 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82917 = *(__local\n double *) &red_arr_mem_82911[(local_tid_73752 +\n offset_82920) * 8];\n }\n // apply reduction operation\n {\n double res_82918 = x_82916 + x_82917;\n \n x_82916 = res_82918;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82911[local_tid_73752 * 8] =\n x_82916;\n }\n }\n skip_waves_82921 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_77705 + smax32(1, N_68508 * K_68510 * D_68526 *\n triD_68516) - 1, smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516)) ==\n 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_73752 == 0) {\n ",
" *(__global double *) &mem_81269[(gtid_73723 * (triD_68516 *\n D_68526 *\n K_68510) +\n gtid_73724 * (triD_68516 *\n D_68526) +\n gtid_73725 * triD_68516 +\n gtid_73726) * 8] = x_82916;\n }\n }\n } else {\n int32_t old_counter_82922;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_73752 == 0) {\n *(__global double *) &group_res_arr_mem_82907[group_id_73753 *\n 8] = x_82916;\n mem_fence_global();\n old_counter_82922 = atomic_add((volatile __global int *) &\n counter_mem_82909[srem32(squot32(group_id_73753,\n squot32(num_groups_77705 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1,\n smax32(1,\n N_68508 *\n ",
" K_68510 *\n D_68526 *\n triD_68516))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82913[0] = old_counter_82922 ==\n squot32(num_groups_77705 + smax32(1, N_68508 * K_68510 *\n D_68526 * triD_68516) - 1,\n smax32(1, N_68508 * K_68510 * D_68526 *\n triD_68516)) - 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82923 = *(__local bool *) &sync_arr_mem_82913[0];\n \n if (is_last_group_82923) {\n if (local_tid_73752 == 0) {\n old_counter_82922 = atomic_add((volatile __global int *) &\n counter_mem_82909[srem32(squot32(group_id_73753,\n squot32(num_groups_77705 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1,\n ",
" smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))),\n 1024) *\n 4], 0 -\n squot32(num_groups_77705 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n triD_68516)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_73752, squot32(num_groups_77705 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n ",
" triD_68516)))) {\n x_77711 = *(__global\n double *) &group_res_arr_mem_82907[(squot32(group_id_73753,\n squot32(num_groups_77705 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516))) *\n squot32(num_groups_77705 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516) -\n 1,\n ",
" smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n triD_68516)) +\n local_tid_73752) *\n 8];\n } else {\n x_77711 = 0.0;\n }\n *(__local double *) &red_arr_mem_82911[local_tid_73752 * 8] =\n x_77711;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82924;\n int32_t skip_waves_82925;\n double x_82916;\n double x_82917;\n \n offset_82924 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73752, group_sizze_77695)) {\n x_82916 = *(__local\n double *) &red_arr_mem_82911[(local_tid_73752 +\n offset_82924) *\n 8];\n }\n }\n offset_82924 = 1;\n while (slt32(offset_82924, wave_sizze_82902)) {\n if (slt32(local_tid_73752 + offset_82924,\n group_sizze_77695) && ((local_tid_73752 -\n squot32(local_tid_73752,\n wave_sizze_82902) *\n ",
" wave_sizze_82902) & (2 *\n offset_82924 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82917 = *(volatile __local\n double *) &red_arr_mem_82911[(local_tid_73752 +\n offset_82924) *\n 8];\n }\n // apply reduction operation\n {\n double res_82918 = x_82916 + x_82917;\n \n x_82916 = res_82918;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82911[local_tid_73752 *\n 8] = x_82916;\n }\n }\n offset_82924 *= 2;\n }\n skip_waves_82925 = 1;\n while (slt32(skip_waves_82925, squot32(group_sizze_77695 +\n wave_sizze_82902 - 1,\n wave_sizze_82902))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82924 = skip_waves_82925 * wave_sizze_82902;\n if (slt32(local_tid_73752 + offset_82924,\n group_sizze_77695) && ((local_tid_73752 -\n squot32(local_tid_73752,\n wave_sizze_82902) *\n ",
" wave_sizze_82902) == 0 &&\n (squot32(local_tid_73752,\n wave_sizze_82902) &\n (2 * skip_waves_82925 -\n 1)) == 0)) {\n // read array element\n {\n x_82917 = *(__local\n double *) &red_arr_mem_82911[(local_tid_73752 +\n offset_82924) *\n 8];\n }\n // apply reduction operation\n {\n double res_82918 = x_82916 + x_82917;\n \n x_82916 = res_82918;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82911[local_tid_73752 *\n 8] = x_82916;\n }\n }\n skip_waves_82925 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_73752 == 0) {\n *(__global double *) &mem_81269[(gtid_73723 *\n (triD_68516 * D_68526 *\n K_68510) +\n gtid_73724 *\n (triD_68516 *\n D_68526) +\n gtid_73",
"725 *\n triD_68516 +\n gtid_73726) * 8] =\n x_82916;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_73924(int32_t N_68508, int32_t K_68510,\n int32_t D_68526, int32_t num_groups_77606,\n __global unsigned char *mem_81194, __global\n unsigned char *mem_81200,\n int32_t thread_per_segment_82852, __global\n unsigned char *group_res_arr_mem_82853,\n __global unsigned char *counter_mem_82855)\n{\n const int32_t group_sizze_77596 = rev_gmm_objectivezigroup_sizze_73906;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82857_backing_0, 8 *\n rev_gmm_objectivezigroup_sizze_73906);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82859_backing_1, 1);\n \n int32_t global_tid_73924;\n int32_t local_tid_73925;\n int32_t group_sizze_82849;\n int32_t wave_sizze_82848;\n int32_t group_id_73926;\n \n global_tid_73924 = get_global_id(0);\n local_tid_73925 = get_local_id(0);\n group_sizze_82849 = get_local_size(0);\n wave_sizze_82848 = LOCKSTEP_WIDTH;\n group_id_73926 = get_group_id(0);\n \n int32_t gtid_73896;\n int32_t gtid_73897;\n int32_t gtid_73898;\n int32_t gtid_73899;\n int32_t gtid_73923;\n __local char *red_arr_mem_82857;\n \n red_arr_mem_82857 = (__local char *) red_arr_mem_82857_backing_0;\n \n __local char *sync_arr_mem_82859;\n \n sync_arr_mem_82859 = (__local char *) sync_arr_mem_82859_backing_1;\n gtid_73896 = squot32(squot32(group_id_73926, squot32(num_groups_77606 +\n smax32(1, N_68508 *\n ",
" K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n K_68510 * D_68526 * D_68526);\n gtid_73897 = squot32(squot32(group_id_73926, squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526))) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n D_68526))), K_68510 *\n D_68526 *",
" D_68526) * (K_68510 * D_68526 *\n D_68526), D_68526 *\n D_68526);\n gtid_73898 = squot32(squot32(group_id_73926, squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526))) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n D_68526))), K_68510 *\n D_68526 * D_68526) * (K_68510 * D_68526 *\n D_68526) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n ",
" D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n D_68526))) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n K_68510 * D_68526 * D_68526) *\n (K_68510 * D_68526 * D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526), D_68526);\n gtid_73899 = squot32(group_id_73926, squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n D_68526))) -\n squot32(squo",
"t32(group_id_73926, squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526 * D_68526))),\n K_68510 * D_68526 * D_68526) * (K_68510 * D_68526 * D_68526) -\n squot32(squot32(group_id_73926, squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526 * D_68526))) -\n squot32(squot32(group_id_73926, squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n K_68510 * D_68526 * D_68526) * (K_68510 * D_68526 *\n ",
" D_68526), D_68526 *\n D_68526) * (D_68526 * D_68526) - squot32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n ",
" smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 * D_68526 *\n D_68526) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n ",
" D_68526))) -\n squot32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n K_68510 *\n D_68526 *\n D_68526) *\n (K_68510 *\n D_68526 *\n D_68526),\n D_68526 *\n D_6852",
"6) *\n (D_68526 * D_68526),\n D_68526) * D_68526;\n \n int32_t chunk_sizze_82861 = smin32(squot32(D_68526 + group_sizze_77596 *\n squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526)) - 1,\n group_sizze_77596 *\n squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n squot32(D_68526 -\n srem32(global_tid_73924,\n group_sizze_77596 *\n squot32(num_groups_77606 +\n smax32(1,\n ",
" N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))) +\n thread_per_segment_82852 - 1,\n thread_per_segment_82852));\n double x_77612;\n double x_77613;\n \n x_77612 = 0.0;\n for (int32_t i_82865 = 0; i_82865 < chunk_sizze_82861; i_82865++) {\n gtid_73923 = srem32(global_tid_73924, group_sizze_77596 *\n squot32(num_groups_77606 + smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 * K_68510 * D_68526 *\n D_68526))) +\n thread_per_segment_82852 * i_82865;\n // apply map function\n {\n double x_77619 = *(__global double *) &mem_81194[(gtid_73896 *\n (D_68526 *\n D_68526 *\n D_68526 *\n K_68510) +\n ",
" gtid_73897 *\n (D_68526 *\n D_68526 *\n D_68526) +\n gtid_73898 *\n (D_68526 *\n D_68526) +\n gtid_73899 *\n D_68526 +\n gtid_73923) * 8];\n \n // save results to be reduced\n {\n x_77613 = x_77619;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_77614 = x_77612 + x_77613;\n \n x_77612 = res_77614;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82857[local_tid_73925 * 8] = x_77612;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82866;\n int32_t skip_waves_82867;\n double x_82862;\n double x_82863;\n \n offset_82866 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73925, group_sizze_77596)) {\n x_82862 = *(__local double *) &red_arr_mem_82857[(local_tid_73925 +\n offset_82866) *\n 8];\n }\n }\n offset_82866 = 1;\n while (slt32(offset_82866, wave_sizze_82848)) {\n if (slt32(local_tid_73925 + offset_82866, group_sizze_77596) &&\n ((local_tid_73925 - squot32(local_tid_73925, wave_sizze_82848) *\n wave_sizze_82848) & (2 * offset",
"_82866 - 1)) == 0) {\n // read array element\n {\n x_82863 = *(volatile __local\n double *) &red_arr_mem_82857[(local_tid_73925 +\n offset_82866) * 8];\n }\n // apply reduction operation\n {\n double res_82864 = x_82862 + x_82863;\n \n x_82862 = res_82864;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82857[local_tid_73925 * 8] = x_82862;\n }\n }\n offset_82866 *= 2;\n }\n skip_waves_82867 = 1;\n while (slt32(skip_waves_82867, squot32(group_sizze_77596 +\n wave_sizze_82848 - 1,\n wave_sizze_82848))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82866 = skip_waves_82867 * wave_sizze_82848;\n if (slt32(local_tid_73925 + offset_82866, group_sizze_77596) &&\n ((local_tid_73925 - squot32(local_tid_73925, wave_sizze_82848) *\n wave_sizze_82848) == 0 && (squot32(local_tid_73925,\n wave_sizze_82848) & (2 *\n skip_waves_82867 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82863 = *(__local\n double *) &red_arr_mem_82857[(local_tid_73925 +\n offset_82866) * 8];\n }\n // apply reduction operation\n {\n double res_82864 = x_82862 + x_82863;\n \n x_82862 = res_82864;\n }\n // write result of operation\n {\n *(__local double *) &red",
"_arr_mem_82857[local_tid_73925 * 8] =\n x_82862;\n }\n }\n skip_waves_82867 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_77606 + smax32(1, N_68508 * K_68510 * D_68526 *\n D_68526) - 1, smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526)) == 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_73925 == 0) {\n *(__global double *) &mem_81200[(gtid_73896 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_73897 * (D_68526 *\n D_68526) +\n gtid_73898 * D_68526 +\n gtid_73899) * 8] = x_82862;\n }\n }\n } else {\n int32_t old_counter_82868;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_73925 == 0) {\n *(__global double *) &group_res_arr_mem_82853[group_id_73926 *\n 8] = x_82862;\n mem_fence_global();\n old_counter_82868 = atomic_add((volatile __global int *) &\n counter_mem_82855[srem32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1,\n ",
" N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82859[0] = old_counter_82868 ==\n squot32(num_groups_77606 + smax32(1, N_68508 * K_68510 *\n D_68526 * D_68526) - 1,\n smax32(1, N_68508 * K_68510 * D_68526 * D_68526)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82869 = *(__local bool *) &sync_arr_mem_82859[0];\n \n if (is_last_group_82869) {\n if (local_tid_73925 == 0) {\n old_counter_82868 = atomic_add((volatile __global int *) &\n counter_mem_82855[srem32(squot32(group_id_73926,\n squot32(num_groups_77606 +\n ",
" smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))),\n 1024) *\n 4], 0 -\n squot32(num_groups_77606 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526 *\n D_68526)));\n }\n // read in the per-group-results\n {\n ",
" if (slt32(local_tid_73925, squot32(num_groups_77606 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526 *\n D_68526)))) {\n x_77612 = *(__global\n double *) &group_res_arr_mem_82853[(squot32(group_id_73926,\n squot32(num_groups_77606 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526))) *\n ",
" squot32(num_groups_77606 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526 *\n D_68526)) +\n local_tid_73925) *\n 8];\n } else {\n x_77612 = 0.0;\n }\n *(__local double *) &red_arr_mem_82857[local_tid_73925 * 8] =\n x_77612;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82870;\n int32_t skip_waves_82871;\n double x_82862;\n double x_82863;\n \n offset_82870 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_73925, group_sizze_77596)) {\n x_82862 = *(__local\n double *) &red_arr_mem_82857[(local_tid_73925 +\n ",
" offset_82870) *\n 8];\n }\n }\n offset_82870 = 1;\n while (slt32(offset_82870, wave_sizze_82848)) {\n if (slt32(local_tid_73925 + offset_82870,\n group_sizze_77596) && ((local_tid_73925 -\n squot32(local_tid_73925,\n wave_sizze_82848) *\n wave_sizze_82848) & (2 *\n offset_82870 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82863 = *(volatile __local\n double *) &red_arr_mem_82857[(local_tid_73925 +\n offset_82870) *\n 8];\n }\n // apply reduction operation\n {\n double res_82864 = x_82862 + x_82863;\n \n x_82862 = res_82864;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82857[local_tid_73925 *\n 8] = x_82862;\n }\n }\n offset_82870 *= 2;\n }\n skip_waves_82871 = 1;\n while (slt32(skip_waves_82871, squot32(group_sizze_77596",
" +\n wave_sizze_82848 - 1,\n wave_sizze_82848))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82870 = skip_waves_82871 * wave_sizze_82848;\n if (slt32(local_tid_73925 + offset_82870,\n group_sizze_77596) && ((local_tid_73925 -\n squot32(local_tid_73925,\n wave_sizze_82848) *\n wave_sizze_82848) == 0 &&\n (squot32(local_tid_73925,\n wave_sizze_82848) &\n (2 * skip_waves_82871 -\n 1)) == 0)) {\n // read array element\n {\n x_82863 = *(__local\n double *) &red_arr_mem_82857[(local_tid_73925 +\n offset_82870) *\n 8];\n }\n // apply reduction operation\n {\n double res_82864 = x_82862 + x_82863;\n \n x_82862 = res_82864;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82857[local_tid_73925 *\n 8] = x_82862;\n }\n }\n skip_waves_82871 *= 2;\n }\n // an",
"d back to memory with the final result\n {\n if (local_tid_73925 == 0) {\n *(__global double *) &mem_81200[(gtid_73896 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_73897 * (D_68526 *\n D_68526) +\n gtid_73898 * D_68526 +\n gtid_73899) * 8] =\n x_82862;\n }\n }\n }\n }\n }\n}\n__kernel void segred_large_74280(int32_t N_68508, int32_t K_68510,\n int32_t D_68526, int32_t num_groups_77406,\n __global unsigned char *res_r_r_mem_80930,\n __global unsigned char *mem_81082, __global\n unsigned char *mem_81087,\n int32_t thread_per_segment_82775, __global\n unsigned char *group_res_arr_mem_82776,\n __global unsigned char *counter_mem_82778)\n{\n const int32_t group_sizze_77396 = rev_gmm_objectivezigroup_sizze_74262;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82780_backing_0, 8 *\n rev_gmm_objectivezigroup_sizze_74262);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82782_backing_1, 1);\n \n int32_t global_tid_74280;\n int32_t local_tid_74281;\n int32_t group_sizze_82772;\n int32_t wave_sizze_82771;\n int32_t group_id_74282;\n \n global_tid_74280 = get_global_id(0);\n local_tid_74281 = get_local_id(0);\n group_sizze_82772 = get_local_size(0);\n wave",
"_sizze_82771 = LOCKSTEP_WIDTH;\n group_id_74282 = get_group_id(0);\n \n int32_t gtid_74252;\n int32_t gtid_74253;\n int32_t gtid_74254;\n int32_t gtid_74279;\n __local char *red_arr_mem_82780;\n \n red_arr_mem_82780 = (__local char *) red_arr_mem_82780_backing_0;\n \n __local char *sync_arr_mem_82782;\n \n sync_arr_mem_82782 = (__local char *) sync_arr_mem_82782_backing_1;\n gtid_74252 = squot32(squot32(group_id_74282, squot32(num_groups_77406 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))),\n K_68510 * D_68526);\n gtid_74253 = squot32(squot32(group_id_74282, squot32(num_groups_77406 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_74282,\n squot32(num_groups_77406 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n ",
" K_68510 * D_68526))),\n K_68510 * D_68526) * (K_68510 * D_68526),\n D_68526);\n gtid_74254 = squot32(group_id_74282, squot32(num_groups_77406 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_74282, squot32(num_groups_77406 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526))), K_68510 *\n D_68526) * (K_68510 * D_68526) - squot32(squot32(group_id_74282,\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n ",
" K_68510 *\n D_68526))) -\n squot32(squot32(group_id_74282,\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))),\n K_68510 *\n D_68526) *\n (K_68510 * D_68526),\n D_68526) * D_68526;\n \n int32_t chunk_sizze_82784 = smin32(squot32(D_68526 + group_sizze_77396 *\n squot32(num_groups_77406 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n ",
" K_68510 *\n D_68526)) - 1,\n group_sizze_77396 *\n squot32(num_groups_77406 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))),\n squot32(D_68526 -\n srem32(global_tid_74280,\n group_sizze_77396 *\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1,\n N_68508 *\n K_68510 *\n D_68526))) +\n thread_per_segment_82775 - 1,\n thread_per_segment_82775));\n double x_77412;\n double x_77413;\n \n x_77412 = 0.0;\n for (int32_t i_82788 = 0; i_82788 < chunk_sizze_82784; i_82788++) {\n gtid_74279 = srem32(global_tid_74280, group_sizze_77396 *\n ",
" squot32(num_groups_77406 + smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 * K_68510 * D_68526))) +\n thread_per_segment_82775 * i_82788;\n // apply map function\n {\n double x_77420;\n double x_77421;\n double res_77422;\n \n x_77420 = *(__global double *) &mem_81082[(gtid_74252 * (D_68526 *\n D_68526 *\n K_68510) +\n gtid_74253 * (D_68526 *\n D_68526) +\n gtid_74254 * D_68526 +\n gtid_74279) * 8];\n x_77421 = *(__global double *) &res_r_r_mem_80930[(gtid_74252 *\n (D_68526 *\n K_68510) +\n gtid_74253 *\n D_68526 +\n gtid_74279) * 8];\n res_77422 = x_77420 * x_77421;\n // save results to be reduced\n {\n x_77413 = res_77422;\n }\n // save map-out results\n { }\n // apply reduction operator\n {\n double res_77414 = x_77412 + x_77413;\n \n x_77412 = res_77414;\n }\n }\n }\n // to reduce current chunk, first store our result to memory\n {\n *(__local double *) &red_arr_mem_82780[local_tid_74281 * 8] ",
"= x_77412;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n int32_t offset_82789;\n int32_t skip_waves_82790;\n double x_82785;\n double x_82786;\n \n offset_82789 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_74281, group_sizze_77396)) {\n x_82785 = *(__local double *) &red_arr_mem_82780[(local_tid_74281 +\n offset_82789) *\n 8];\n }\n }\n offset_82789 = 1;\n while (slt32(offset_82789, wave_sizze_82771)) {\n if (slt32(local_tid_74281 + offset_82789, group_sizze_77396) &&\n ((local_tid_74281 - squot32(local_tid_74281, wave_sizze_82771) *\n wave_sizze_82771) & (2 * offset_82789 - 1)) == 0) {\n // read array element\n {\n x_82786 = *(volatile __local\n double *) &red_arr_mem_82780[(local_tid_74281 +\n offset_82789) * 8];\n }\n // apply reduction operation\n {\n double res_82787 = x_82785 + x_82786;\n \n x_82785 = res_82787;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82780[local_tid_74281 * 8] = x_82785;\n }\n }\n offset_82789 *= 2;\n }\n skip_waves_82790 = 1;\n while (slt32(skip_waves_82790, squot32(group_sizze_77396 +\n wave_sizze_82771 - 1,\n wave_sizze_82771))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82789 = skip_waves_82790 * wave_sizze_82771;\n if (slt32(local_tid_74281 + offset_82789, group_sizze_77396) &&\n ((local_tid_74281 - squot32(local_tid_74281, wave_sizze_82771) *\n wave_sizze_82771) == 0 && (squot",
"32(local_tid_74281,\n wave_sizze_82771) & (2 *\n skip_waves_82790 -\n 1)) ==\n 0)) {\n // read array element\n {\n x_82786 = *(__local\n double *) &red_arr_mem_82780[(local_tid_74281 +\n offset_82789) * 8];\n }\n // apply reduction operation\n {\n double res_82787 = x_82785 + x_82786;\n \n x_82785 = res_82787;\n }\n // write result of operation\n {\n *(__local double *) &red_arr_mem_82780[local_tid_74281 * 8] =\n x_82785;\n }\n }\n skip_waves_82790 *= 2;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n if (squot32(num_groups_77406 + smax32(1, N_68508 * K_68510 * D_68526) - 1,\n smax32(1, N_68508 * K_68510 * D_68526)) == 1) {\n // first thread in group saves final result to memory\n {\n if (local_tid_74281 == 0) {\n *(__global double *) &mem_81087[(gtid_74252 * (D_68526 *\n K_68510) +\n gtid_74253 * D_68526 +\n gtid_74254) * 8] = x_82785;\n }\n }\n } else {\n int32_t old_counter_82791;\n \n // first thread in group saves group result to global memory\n {\n if (local_tid_74281 == 0) {\n *(__global double *) &group_res_arr_mem_82776[group_id_74282 *\n 8] = x_82785;\n mem_fence_global();\n old_counter_82791 = atomic_add((volatile __global int *) &\n ",
" counter_mem_82778[srem32(squot32(group_id_74282,\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))),\n 1024) *\n 4], 1);\n *(__local bool *) &sync_arr_mem_82782[0] = old_counter_82791 ==\n squot32(num_groups_77406 + smax32(1, N_68508 * K_68510 *\n D_68526) - 1, smax32(1,\n N_68508 *\n K_68510 *\n D_68526)) -\n 1;\n }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n \n bool is_last_group_82792 = *(__local bool *) &sync_arr_mem_82782[0];\n \n if (is_last_group_82792) {\n if (local_tid_74281 == ",
"0) {\n old_counter_82791 = atomic_add((volatile __global int *) &\n counter_mem_82778[srem32(squot32(group_id_74282,\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))),\n 1024) *\n 4], 0 -\n squot32(num_groups_77406 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526)));\n }\n // read in the per-group-results\n {\n if (slt32(local_tid_74281, squot32(nu",
"m_groups_77406 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 *\n D_68526)))) {\n x_77412 = *(__global\n double *) &group_res_arr_mem_82776[(squot32(group_id_74282,\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526))) *\n squot32(num_groups_77406 +\n smax32(1,\n N_68508 *\n K_68510 *\n ",
" D_68526) -\n 1,\n smax32(1,\n N_68508 *\n K_68510 *\n D_68526)) +\n local_tid_74281) *\n 8];\n } else {\n x_77412 = 0.0;\n }\n *(__local double *) &red_arr_mem_82780[local_tid_74281 * 8] =\n x_77412;\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n // reduce the per-group results\n {\n int32_t offset_82793;\n int32_t skip_waves_82794;\n double x_82785;\n double x_82786;\n \n offset_82793 = 0;\n // participating threads read initial accumulator\n {\n if (slt32(local_tid_74281, group_sizze_77396)) {\n x_82785 = *(__local\n double *) &red_arr_mem_82780[(local_tid_74281 +\n offset_82793) *\n 8];\n }\n }\n offset_82793 = 1;\n while (slt32(offset_82793, wave_sizze_82771)) {\n if (slt32(local_tid_74281 + offset_82793,\n group_sizze_77396) && ((local_tid_74281 -\n squot32(local_tid_74281,\n ",
" wave_sizze_82771) *\n wave_sizze_82771) & (2 *\n offset_82793 -\n 1)) ==\n 0) {\n // read array element\n {\n x_82786 = *(volatile __local\n double *) &red_arr_mem_82780[(local_tid_74281 +\n offset_82793) *\n 8];\n }\n // apply reduction operation\n {\n double res_82787 = x_82785 + x_82786;\n \n x_82785 = res_82787;\n }\n // write result of operation\n {\n *(volatile __local\n double *) &red_arr_mem_82780[local_tid_74281 *\n 8] = x_82785;\n }\n }\n offset_82793 *= 2;\n }\n skip_waves_82794 = 1;\n while (slt32(skip_waves_82794, squot32(group_sizze_77396 +\n wave_sizze_82771 - 1,\n wave_sizze_82771))) {\n barrier(CLK_LOCAL_MEM_FENCE);\n offset_82793 = skip_waves_82794 * wave_sizze_82771;\n if (slt32(local_tid_74281 + offset_82793,\n group_sizze_77396) && ((local_tid_74281 -\n squot32(local_tid_74281,\n ",
" wave_sizze_82771) *\n wave_sizze_82771) == 0 &&\n (squot32(local_tid_74281,\n wave_sizze_82771) &\n (2 * skip_waves_82794 -\n 1)) == 0)) {\n // read array element\n {\n x_82786 = *(__local\n double *) &red_arr_mem_82780[(local_tid_74281 +\n offset_82793) *\n 8];\n }\n // apply reduction operation\n {\n double res_82787 = x_82785 + x_82786;\n \n x_82785 = res_82787;\n }\n // write result of operation\n {\n *(__local\n double *) &red_arr_mem_82780[local_tid_74281 *\n 8] = x_82785;\n }\n }\n skip_waves_82794 *= 2;\n }\n // and back to memory with the final result\n {\n if (local_tid_74281 == 0) {\n *(__global double *) &mem_81087[(gtid_74252 * (D_68526 *\n K_68510) +\n gtid_74253 * D_68526 +\n gtid_74254) * 8] =\n x_82785;\n }\n }\n }\n }",
"\n }\n}\n__kernel void segred_large_74786(int32_t N_68508, int32_t K_68510,\n int32_t D_68526, int32_t num_groups_77093,\n __global unsigned char *mem_80879, __global\n unsigned char *mem_80889, __global\n unsigned char *mem_80919,\n int32_t thread_per_segment_82664, __global\n unsigned char *group_res_arr_mem_82665,\n __global unsigned char *counter_mem_82667)\n{\n const int32_t group_sizze_77083 = rev_gmm_objectivezigroup_sizze_74768;\n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n ALIGNED_LOCAL_MEMORY(red_arr_mem_82669_backing_0, 8 *\n rev_gmm_objectivezigroup_sizze_74768);\n ALIGNED_LOCAL_MEMORY(sync_arr_mem_82671_backing_1, 1);\n \n int32_t global_tid_74786;\n int32_t local_tid_74787;\n int32_t group_sizze_82661;\n int32_t wave_sizze_82660;\n int32_t group_id_74788;\n \n global_tid_74786 = get_global_id(0);\n local_tid_74787 = get_local_id(0);\n group_sizze_82661 = get_local_size(0);\n wave_sizze_82660 = LOCKSTEP_WIDTH;\n group_id_74788 = get_group_id(0);\n \n int32_t gtid_74758;\n int32_t gtid_74759;\n int32_t gtid_74760;\n int32_t gtid_74785;\n __local char *red_arr_mem_82669;\n \n red_arr_mem_82669 = (__local char *) red_arr_mem_82669_backing_0;\n \n __local char *sync_arr_mem_82671;\n \n sync_arr_mem_82671 = (__local char *) sync_arr_mem_82671_backing_1;\n gtid_74758 = squot32(squot32(group_id_74788, squot32(num_groups_77093 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_",
"68508 *\n K_68510 *\n D_68526))),\n K_68510 * D_68526);\n gtid_74759 = squot32(squot32(group_id_74788, squot32(num_groups_77093 +\n smax32(1, N_68508 *\n K_68510 *\n D_68526) - 1,\n smax32(1, N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_74788,\n squot32(num_groups_77093 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 * D_68526))),\n K_68510 * D_68526) * (K_68510 * D_68526),\n D_68526);\n gtid_74760 = squot32(group_id_74788, squot32(num_groups_77093 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 *\n K_68510 *\n D_68526))) -\n squot32(squot32(group_id_74788, squot32(num_groups_7709",
"3 + smax32(1,\n N_68508 *\n K_68510 *\n D_68526) -\n 1, smax32(1, N_68508 * K_68510 *\n D_68526))), K_68510 *\n D_68526) * (K_68510 * D_68526) - squot32(squot32(group_id_74788,\n squot32(num_groups_77093 +\n smax32(1,\n N_68508 *\n K
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment