Skip to content

Instantly share code, notes, and snippets.

@Munksgaard
Last active June 21, 2023 11:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Munksgaard/f2f1d5fa15e7deb8cd1fa6fad9b5edac to your computer and use it in GitHub Desktop.
Save Munksgaard/f2f1d5fa15e7deb8cd1fa6fad9b5edac to your computer and use it in GitHub Desktop.
// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs)
// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]
// We need to define _GNU_SOURCE before
// _any_ header files are included to get
// the usage statistics of a thread (i.e. have RUSAGE_THREAD) on GNU/Linux
// https://manpages.courier-mta.org/htmlman2/getrusage.2.html
#ifndef _GNU_SOURCE // Avoid possible double-definition warning.
#define _GNU_SOURCE
#endif
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wparentheses"
#pragma clang diagnostic ignored "-Wunused-label"
#elif __GNUC__
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wparentheses"
#pragma GCC diagnostic ignored "-Wunused-label"
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#endif
// Headers
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <float.h>
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Initialisation
struct futhark_context_config;
struct futhark_context_config *futhark_context_config_new(void);
void futhark_context_config_free(struct futhark_context_config *cfg);
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value);
struct futhark_context;
struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
void futhark_context_free(struct futhark_context *cfg);
void futhark_context_config_add_build_option(struct futhark_context_config *cfg, const char *opt);
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
void futhark_context_config_list_devices(struct futhark_context_config *cfg);
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue);
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag);
int futhark_get_tuning_param_count(void);
const char *futhark_get_tuning_param_name(int);
const char *futhark_get_tuning_param_class(int);
// Arrays
struct futhark_i64_1d;
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0);
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data);
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
struct futhark_u8_1d;
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0);
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data);
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
// Opaque values
// Entry points
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1);
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1);
// Miscellaneous
int futhark_context_sync(struct futhark_context *ctx);
cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f);
char *futhark_context_report(struct futhark_context *ctx);
char *futhark_context_get_error(struct futhark_context *ctx);
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
void futhark_context_pause_profiling(struct futhark_context *ctx);
void futhark_context_unpause_profiling(struct futhark_context *ctx);
int futhark_context_clear_caches(struct futhark_context *ctx);
#define FUTHARK_BACKEND_opencl
#define FUTHARK_SUCCESS 0
#define FUTHARK_PROGRAM_ERROR 2
#define FUTHARK_OUT_OF_MEMORY 3
#ifdef __cplusplus
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <stdint.h>
// If NDEBUG is set, the assert() macro will do nothing. Since Futhark
// (unfortunately) makes use of assert() for error detection (and even some
// side effects), we want to avoid that.
#undef NDEBUG
#include <assert.h>
#include <stdarg.h>
// Start of util.h.
//
// Various helper functions that are useful in all generated C code.
#include <errno.h>
#include <string.h>
static const char *fut_progname = "(embedded Futhark)";
static void futhark_panic(int eval, const char *fmt, ...) __attribute__((noreturn));
static char* msgprintf(const char *s, ...);
static void* slurp_file(const char *filename, size_t *size);
static int dump_file(const char *file, const void *buf, size_t n);
struct str_builder;
static void str_builder_init(struct str_builder *b);
static void str_builder(struct str_builder *b, const char *s, ...);
static char *strclone(const char *str);
// Report a fatal error and terminate the process.
//
// Writes the program name (fut_progname) followed by the printf-style
// message described by 'fmt' and its arguments to stderr, then calls
// exit() with 'eval' as the exit status.  This function never returns.
static void futhark_panic(int eval, const char *fmt, ...) {
  va_list args;

  fprintf(stderr, "%s: ", fut_progname);

  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);

  exit(eval);
}
// For generating arbitrary-sized error messages.
//
// Formats the printf-style message 's' into a freshly malloc()ed,
// NUL-terminated buffer that is exactly large enough to hold it.  It is
// the caller's responsibility to free() the buffer at some point.
//
// Returns NULL if the message cannot be formatted or if memory cannot be
// allocated.
static char* msgprintf(const char *s, ...) {
  va_list vl;

  // First pass: only measure how many characters the message needs.
  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl); // Every va_start must be paired with a va_end.
  if (len < 0) {
    // Formatting error; do not turn the negative value into a huge size.
    return NULL;
  }

  size_t needed = (size_t)len + 1; // +1 for the terminating NUL.
  char *buffer = (char*) malloc(needed);
  if (buffer == NULL) {
    return NULL;
  }

  // Second pass: the argument list was consumed above, so re-initialise it.
  va_start(vl, s);
  vsnprintf(buffer, needed, s, vl);
  va_end(vl);
  return buffer;
}
// If 'errval' is nonzero, print a diagnostic to stderr and terminate the
// process with exit status 'errval'; if it is zero, do nothing.
//
//   errval     - error indicator; nonzero means failure.
//   sets_errno - if nonzero, the message reports strerror(errno); otherwise
//                it reports the numeric value of 'errval'.
//   fun, line  - location of the failing call, included in the message.
//   msg, ...   - printf-style description of what failed.
static inline void check_err(int errval, int sets_errno, const char *fun, int line,
                             const char *msg, ...) {
  if (errval) {
    // Grab errno before any stdio call below has a chance to overwrite it.
    int saved_errno = errno;

    // Textual form of the error code, used when errno is not relevant.
    char errnum[32];
    snprintf(errnum, sizeof(errnum), "%d", errval);

    va_list vl;
    va_start(vl, msg);
    fprintf(stderr, "ERROR: ");
    vfprintf(stderr, msg, vl);
    va_end(vl);
    fprintf(stderr, " in %s() at line %d with error code %s\n",
            fun, line,
            sets_errno ? strerror(saved_errno) : errnum);
    exit(errval);
  }
}
#define CHECK_ERR(err, ...) check_err(err, 0, __func__, __LINE__, __VA_ARGS__)
#define CHECK_ERRNO(err, ...) check_err(err, 1, __func__, __LINE__, __VA_ARGS__)
// Read the rest of an open file into a NUL-terminated string; returns
// NULL on error.
//
//   f    - open, seekable stream; reading starts at its current position.
//   size - if non-NULL, receives the number of bytes between the current
//          position and the end of the file (not counting the added NUL).
//          It is set to 0 when the size cannot be determined.
//
// The returned buffer is allocated with malloc() and must be released
// with free() by the caller.
static void* fslurp_file(FILE *f, size_t *size) {
  if (size) {
    *size = 0;
  }

  // Measure the remaining bytes by seeking to the end and back again.
  long start = ftell(f);
  if (start < 0 || fseek(f, 0, SEEK_END) != 0) {
    return NULL;
  }
  long end = ftell(f);
  if (end < start || fseek(f, start, SEEK_SET) != 0) {
    return NULL;
  }
  size_t src_size = (size_t)(end - start);

  unsigned char *s = (unsigned char*) malloc(src_size + 1);
  if (s == NULL) {
    return NULL;
  }
  if (fread(s, 1, src_size, f) != src_size) {
    free(s);
    s = NULL;
  } else {
    s[src_size] = '\0';
  }
  if (size) {
    *size = src_size;
  }
  return s;
}
// Load the whole file named 'filename' into a NUL-terminated buffer by
// opening it and handing the stream to fslurp_file(), which also receives
// 'size'.  Returns NULL if the file cannot be opened or fslurp_file()
// fails.
static void* slurp_file(const char *filename, size_t *size) {
  void *contents = NULL;
  // Binary mode, to avoid Windows messing with linebreaks.
  FILE *in = fopen(filename, "rb");
  if (in != NULL) {
    contents = fslurp_file(in, size);
    fclose(in);
  }
  return contents;
}
// Dump 'n' bytes from 'buf' into the file at the designated location,
// creating or truncating it.  Returns 0 on success and 1 if the file
// cannot be opened, not all bytes can be written, or closing (and thereby
// flushing) the file fails.
static int dump_file(const char *file, const void *buf, size_t n) {
  // Binary mode: the buffer is raw bytes, so no linebreak translation must
  // take place (slurp_file reads with "rb" for the same reason).
  FILE *f = fopen(file, "wb");
  if (f == NULL) {
    return 1;
  }
  int failed = fwrite(buf, sizeof(char), n, f) != n;
  // Always close the stream, also after a failed write, so it is not leaked.
  if (fclose(f) != 0) {
    failed = 1;
  }
  return failed;
}
// A growable, heap-allocated, NUL-terminated string that is extended with
// printf-style appends via str_builder().  The buffer in 'str' is owned by
// the builder's user and must eventually be released with free().
struct str_builder {
  char *str;
  size_t capacity; // Size of buffer.
  size_t used; // Bytes used, *not* including final zero.
};

// Initialise 'b' to the empty string with a small initial buffer.  If the
// buffer cannot be allocated, 'str' is NULL and 'capacity' is 0; a later
// str_builder() call will then try to allocate again.
static void str_builder_init(struct str_builder *b) {
  b->capacity = 10;
  b->used = 0;
  b->str = malloc(b->capacity);
  if (b->str == NULL) {
    b->capacity = 0;
    return;
  }
  b->str[0] = 0;
}

// Append the printf-style formatted text 's' to the builder, doubling the
// buffer size until the result (plus the final zero) fits.  If formatting
// fails or memory cannot be obtained, the builder is left unchanged.
static void str_builder(struct str_builder *b, const char *s, ...) {
  va_list vl;

  // First pass: only measure the text to be appended.
  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl);
  if (len < 0) {
    return;
  }
  size_t needed = (size_t)len;

  // Same doubling policy as before, computed up front so the buffer is
  // reallocated at most once.
  size_t new_capacity = b->capacity > 0 ? b->capacity : 10;
  while (new_capacity < b->used + needed + 1) {
    new_capacity *= 2;
  }
  if (new_capacity != b->capacity || b->str == NULL) {
    // Go through a temporary so the old buffer is not lost on failure.
    char *grown = realloc(b->str, new_capacity);
    if (grown == NULL) {
      return;
    }
    b->str = grown;
    b->capacity = new_capacity;
  }

  // Second pass: the argument list was consumed above, so re-initialise it.
  va_start(vl, s);
  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
  va_end(vl);
  b->used += needed;
}
// Return a malloc()ed copy of the NUL-terminated string 'str' (including
// its terminator), or NULL if the allocation fails.  The caller owns the
// copy and releases it with free().
static char *strclone(const char *str) {
  const size_t nbytes = strlen(str) + 1; // Include the terminating NUL.
  char *clone = (char*) malloc(nbytes);
  if (clone != NULL) {
    memcpy(clone, str, nbytes);
  }
  return clone;
}
// End of util.h.
// Start of cache.h
#define CACHE_HASH_SIZE 8 // In 32-bit words.
// Accumulated hash that identifies what a cache file was built from.
struct cache_hash {
  uint32_t hash[CACHE_HASH_SIZE];
};
// Initialise a blank hash (all words zero).
static void cache_hash_init(struct cache_hash *c);
// Hash some bytes and add them to the accumulated hash.
static void cache_hash(struct cache_hash *out, const char *in, size_t n);
// Try to restore cache contents from a file with the given name.
// Assumes the cache is invalid if it does not contain the given hash.
// Allocates memory and reads the cache contents, which are returned in
// *buf with size *buflen; the caller releases *buf with free().  If the
// cache is successfully loaded, this function returns 0.  Otherwise it
// returns nonzero and leaves *buf and *buflen untouched.  Errno is set if
// the failure to load the cache is due to anything except invalid
// cache contents.  Note that failing to restore the cache is not
// necessarily a problem: it might just be invalid or not created yet.
static int cache_restore(const char *fname, const struct cache_hash *hash,
                         unsigned char **buf, size_t *buflen);
// Store cache contents in the given file, with the given hash.
// Returns 0 on success and nonzero if the file could not be written.
static int cache_store(const char *fname, const struct cache_hash *hash,
                       const unsigned char *buf, size_t buflen);
// Now for the implementation.
//
// File layout, as written by cache_store (integers in the byte order of
// the writing machine):
//   CACHE_HEADER_SIZE bytes   magic header
//   one int64_t               total size of the file in bytes
//   CACHE_HASH_SIZE words     hash
//   remaining bytes           cache contents
static void cache_hash_init(struct cache_hash *c) {
  memset(c->hash, 0, CACHE_HASH_SIZE * sizeof(uint32_t));
}
static void cache_hash(struct cache_hash *out, const char *in, size_t n) {
  // Adaptation of djb2 for larger output size by storing intermediate
  // states.
  uint32_t hash = 5381;
  for (size_t i = 0; i < n; i++) {
    hash = ((hash << 5) + hash) + in[i];
    out->hash[i % CACHE_HASH_SIZE] ^= hash;
  }
}
#define CACHE_HEADER_SIZE 8
// Magic bytes at the start of every cache file: "FUTHARK" plus one NUL.
static const char cache_header[CACHE_HEADER_SIZE] = "FUTHARK\0";
static int cache_restore(const char *fname, const struct cache_hash *hash,
                         unsigned char **buf, size_t *buflen) {
  int saved_errno;
  FILE *f = fopen(fname, "rb");
  if (f == NULL) {
    return 1;
  }
  char f_header[CACHE_HEADER_SIZE];
  if (fread(f_header, sizeof(char), CACHE_HEADER_SIZE, f) != CACHE_HEADER_SIZE) {
    goto error;
  }
  if (memcmp(f_header, cache_header, CACHE_HEADER_SIZE) != 0) {
    // Not a cache file at all: invalid contents rather than an I/O error.
    errno = 0;
    goto error;
  }
  // Determine the actual file size so it can be checked against the size
  // recorded in the file.
  if (fseek(f, 0, SEEK_END) != 0) {
    goto error;
  }
  long f_end = ftell(f);
  if (f_end < 0) {
    goto error;
  }
  int64_t f_size = (int64_t)f_end;
  if (fseek(f, CACHE_HEADER_SIZE, SEEK_SET) != 0) {
    goto error;
  }
  int64_t expected_size;
  if (fread(&expected_size, sizeof(int64_t), 1, f) != 1) {
    goto error;
  }
  if (f_size != expected_size) {
    errno = 0;
    goto error;
  }
  uint32_t f_hash[CACHE_HASH_SIZE];
  if (fread(f_hash, sizeof(uint32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
    goto error;
  }
  // Compare the whole hash; memcmp counts bytes, not 32-bit words.
  if (memcmp(f_hash, hash->hash, sizeof(f_hash)) != 0) {
    errno = 0;
    goto error;
  }
  const size_t meta_size = CACHE_HEADER_SIZE + sizeof(int64_t) + sizeof(f_hash);
  if ((uint64_t)f_size < meta_size) {
    errno = 0;
    goto error;
  }
  size_t payload_size = (size_t)f_size - meta_size;
  unsigned char *payload = malloc(payload_size);
  // malloc(0) may legitimately return NULL, so only a failed non-empty
  // allocation is an error.
  if (payload == NULL && payload_size != 0) {
    goto error;
  }
  if (fread(payload, sizeof(char), payload_size, f) != payload_size) {
    free(payload);
    goto error;
  }
  fclose(f);
  // Only publish the results once everything has succeeded.
  *buf = payload;
  *buflen = payload_size;
  return 0;
 error:
  // Closing the file must not disturb the errno reported to the caller.
  saved_errno = errno;
  fclose(f);
  errno = saved_errno;
  return 1;
}
static int cache_store(const char *fname, const struct cache_hash *hash,
                       const unsigned char *buf, size_t buflen) {
  FILE *f = fopen(fname, "wb");
  if (f == NULL) {
    return 1;
  }
  if (fwrite(cache_header, CACHE_HEADER_SIZE, 1, f) != 1) {
    goto error;
  }
  int64_t size = CACHE_HEADER_SIZE + sizeof(int64_t) + CACHE_HASH_SIZE*sizeof(int32_t) + buflen;
  if (fwrite(&size, sizeof(size), 1, f) != 1) {
    goto error;
  }
  if (fwrite(hash->hash, sizeof(int32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
    goto error;
  }
  if (fwrite(buf, sizeof(unsigned char), buflen, f) != buflen) {
    goto error;
  }
  // fclose flushes buffered data, so a failure here means the file is
  // incomplete and must be reported.
  if (fclose(f) != 0) {
    return 1;
  }
  return 0;
 error:
  fclose(f);
  return 1;
}
// End of cache.h
// Start of half.h.
// Conversion functions are from http://half.sourceforge.net/, but
// translated to C.
//
// Copyright (c) 2012-2021 Christian Rau
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// __constant is an OpenCL C address-space qualifier.  When this code is
// not being compiled as OpenCL C (__OPENCL_VERSION__ is not defined),
// define it as an empty macro so that the table declarations below are
// still valid C.
#ifndef __OPENCL_VERSION__
#define __constant
#endif
// Lookup table of 512 16-bit values for the half-precision conversion
// code.  Entries 256..511 are entries 0..255 with bit 0x8000 set.
// NOTE(review): presumably indexed by the sign and exponent bits of a
// 32-bit float when converting to half precision; the code that uses the
// table is outside this view - confirm.
__constant static const uint16_t base_table[512] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 };
// Lookup table of 512 small values (each between 13 and 24) for the
// half-precision conversion code.  Entries 256..511 repeat entries 0..255.
// NOTE(review): presumably the right-shift applied to a 32-bit float's
// mantissa, selected by the same index as base_table; the code that uses
// the table is outside this view - confirm.
__constant static const unsigned char shift_table[512] = {
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };
__constant static const uint32_t mantissa_table[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,
0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,
0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,
0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,
0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,
0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,
0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,
0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,
0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,
0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,
0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,
0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,
0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,
0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,
0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,
0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,
0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,
0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,
0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,
0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,
0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,
0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,
0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,
0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,
0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000,
0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,
0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,
0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,
0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,
0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,
0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,
0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,
0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,
0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,
0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,
0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,
0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,
0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,
0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,
0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,
0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,
0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,
0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,
0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,
0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,
0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,
0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,
0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,
0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,
0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,
0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,
0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,
0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,
0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,
0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,
0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000,
0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,
0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,
0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,
0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,
0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,
0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,
0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,
0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,
0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,
0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,
0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,
0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,
0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,
0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,
0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,
0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,
0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,
0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,
0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,
0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,
0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,
0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,
0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,
0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,
0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,
0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,
0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,
0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,
0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,
0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,
0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000,
0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,
0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,
0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,
0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,
0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,
0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,
0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,
0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,
0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,
0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,
0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,
0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,
0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,
0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,
0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,
0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,
0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,
0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,
0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,
0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,
0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,
0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,
0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,
0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,
0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,
0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,
0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,
0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,
0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,
0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000,
0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,
0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,
0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,
0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,
0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,
0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,
0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,
0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,
0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };
// Lookup table used by halfbits2float(): it is indexed by the upper six
// bits of a half-precision bit pattern (value>>10) and yields the term
// that is added to the selected mantissa_table entry to form the 32-bit
// float bit pattern.  Entries 32-63 repeat entries 0-31 with bit 31 set.
__constant static const uint32_t exponent_table[64] = {
0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000,
0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,
0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };
// Lookup table used by halfbits2float(): it is indexed by the upper six
// bits of a half-precision bit pattern (value>>10) and gives the base
// index into mantissa_table, to which the low ten bits of the pattern are
// added.  The base is 0 for indices 0 and 32 and 1024 for all others.
__constant static const unsigned short offset_table[64] = {
0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };
// Convert a single-precision float to a 16-bit half-precision bit pattern.
// The upper nine bits of the float's bit pattern select an entry in
// base_table and shift_table; the low 23 bits are shifted right by the
// shift_table amount and added to the base_table value.
static uint16_t float2halfbits(float value) {
  union { float as_float; uint32_t as_bits; } pun;
  pun.as_float = value;
  const uint32_t raw = pun.as_bits;
  const uint32_t index = raw >> 23;
  const uint32_t low_bits = raw & 0x7FFFFF;
  return base_table[index] + (uint16_t)(low_bits >> shift_table[index]);
}
// Convert a 16-bit half-precision bit pattern to a single-precision float.
// The upper six bits of the pattern select an offset_table and
// exponent_table entry; the low ten bits plus the offset select a
// mantissa_table entry.  The two table values are summed to give the float's
// bit pattern.
static float halfbits2float(uint16_t value) {
  const unsigned int high = value >> 10;
  const unsigned int low = value & 0x3FF;
  union { uint32_t as_bits; float as_float; } pun;
  pun.as_bits = mantissa_table[offset_table[high] + low] + exponent_table[high];
  return pun.as_float;
}
// Return the half-precision bit pattern adjacent to `from` in the direction
// of `to`, operating purely on bit patterns.
//
//  * If either operand has a magnitude above 0x7C00, that operand is
//    returned with bit 0x200 set (`from` takes precedence).
//  * If the operands are equal, or both have zero magnitude, `to` is returned.
//  * If `from` has zero magnitude, the result is magnitude 1 with the sign
//    bit of `to`.
//  * Otherwise `from` is moved by one unit towards `to`.
static uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {
  const int from_mag = from & 0x7FFF;
  const int to_mag = to & 0x7FFF;
  if (from_mag > 0x7C00) {
    return from | 0x200;
  }
  if (to_mag > 0x7C00) {
    return to | 0x200;
  }
  if (from == to || (from_mag | to_mag) == 0) {
    return to;
  }
  if (from_mag == 0) {
    return (to & 0x8000) + 1;
  }
  // Map each pattern to a key that orders like the value it encodes:
  // patterns with the sign bit set have all bits flipped, the others only
  // have their sign bit flipped.
  const unsigned int from_sign = from >> 15;
  const unsigned int to_sign = to >> 15;
  const unsigned int from_key = from ^ (from_sign ? 0xFFFFu : 0x8000u);
  const unsigned int to_key = to ^ (to_sign ? 0xFFFFu : 0x8000u);
  const unsigned int ascending = from_key < to_key;
  // The step is +1 when exactly one of "sign bit set" and "moving upwards"
  // holds, and -1 otherwise.
  const unsigned int step = (from_sign ^ ascending) << 1;
  return (uint16_t)(from + step - 1);
}
// End of half.h.
// Start of timing.h.
// The function get_wall_time() returns the wall time in microseconds
// (with an unspecified offset).
#ifdef _WIN32
#include <windows.h>
static int64_t get_wall_time(void) {
LARGE_INTEGER time,freq;
assert(QueryPerformanceFrequency(&freq));
assert(QueryPerformanceCounter(&time));
return ((double)time.QuadPart / freq.QuadPart) * 1000000;
}
#else
// Assuming POSIX
#include <time.h>
#include <sys/time.h>
// Return the wall time in microseconds (with an unspecified offset), as
// reported by gettimeofday().
//
// gettimeofday() is called outside assert() so that the call still happens
// when the file is compiled with NDEBUG; otherwise `time` would be read
// uninitialised.  The seconds are widened to int64_t before scaling so the
// multiplication cannot overflow a narrower time_t.
static int64_t get_wall_time(void) {
  struct timeval time;
  int r = gettimeofday(&time, NULL);
  assert(r == 0);
  (void)r; // Silence "unused" warnings when assert() is compiled out.
  return (int64_t)time.tv_sec * 1000000 + time.tv_usec;
}
// Return the wall time in nanoseconds, read from CLOCK_REALTIME.
//
// clock_gettime() is called outside assert() so that the call still happens
// when the file is compiled with NDEBUG; otherwise `time` would be read
// uninitialised.  The seconds are widened to int64_t before scaling so the
// multiplication cannot overflow a narrower time_t.
static int64_t get_wall_time_ns(void) {
  struct timespec time;
  int r = clock_gettime(CLOCK_REALTIME, &time);
  assert(r == 0);
  (void)r; // Silence "unused" warnings when assert() is compiled out.
  return (int64_t)time.tv_sec * 1000000000 + time.tv_nsec;
}
#endif
// End of timing.h.
// Start of lock.h.
// A very simple cross-platform implementation of locks. Uses
// pthreads on Unix and Win32 mutex objects on Windows. Futhark's
// host-level code is not multithreaded, but user code may be, so we
// need some mechanism for ensuring atomic access to API functions.
// This is that mechanism. It is not exposed to user code at all, so
// we do not have to worry about name collisions.
#ifdef _WIN32
typedef HANDLE lock_t;
// Initialise *lock with a new, unnamed, initially unlocked Win32 mutex.
// As in the pthreads variant of this function, creation failure is
// asserted rather than silently ignored (CreateMutex returns NULL on
// failure).
static void create_lock(lock_t *lock) {
  HANDLE mutex = CreateMutex(NULL,  // Default security attributes.
                             FALSE, // Initially unlocked.
                             NULL); // Unnamed.
  assert(mutex != NULL);
  *lock = mutex;
}
// Acquire the lock, waiting indefinitely.
//
// The wait is performed outside assert() so that the lock is still taken
// when the file is compiled with NDEBUG.
static void lock_lock(lock_t *lock) {
  DWORD r = WaitForSingleObject(*lock, INFINITE);
  assert(r == WAIT_OBJECT_0);
  (void)r; // Silence "unused" warnings when assert() is compiled out.
}
// Release the lock.
//
// The release is performed outside assert() so that the lock is still
// released when the file is compiled with NDEBUG.
static void lock_unlock(lock_t *lock) {
  BOOL ok = ReleaseMutex(*lock);
  assert(ok);
  (void)ok; // Silence "unused" warnings when assert() is compiled out.
}
// Dispose of the lock by closing the mutex handle stored in *lock.
static void free_lock(lock_t *lock) {
  HANDLE mutex = *lock;
  CloseHandle(mutex);
}
#else
// Assuming POSIX
#include <pthread.h>
typedef pthread_mutex_t lock_t;
// Initialise *lock as a pthread mutex with default attributes; failure is
// asserted.
static void create_lock(lock_t *lock) {
  const int status = pthread_mutex_init(lock, NULL);
  assert(status == 0);
}
// Acquire the mutex; failure is asserted.
static void lock_lock(lock_t *lock) {
  const int status = pthread_mutex_lock(lock);
  assert(status == 0);
}
// Release the mutex; failure is asserted.
static void lock_unlock(lock_t *lock) {
  const int status = pthread_mutex_unlock(lock);
  assert(status == 0);
}
// Dispose of the lock.
//
// A mutex set up with pthread_mutex_init() should be released with
// pthread_mutex_destroy(); some implementations attach resources to an
// initialised mutex.  The lock must not be held when this is called.  The
// result is deliberately ignored so that disposal stays best-effort.
static void free_lock(lock_t *lock) {
  (void)pthread_mutex_destroy(lock);
}
#endif
// End of lock.h.
// Start of free_list.h.
typedef uintptr_t fl_mem;
// An entry in the free list.  May be invalid, to avoid having to
// deallocate entries as soon as they are removed.  There is also a
// tag, to help with memory reuse.
struct free_list_entry {
size_t size; // Size recorded for the block when it was inserted.
fl_mem mem; // Opaque handle to the block (fl_mem is a uintptr_t).
const char *tag; // Tag given at insertion; compared by pointer, not by content.
unsigned char valid; // Nonzero if this slot currently holds a block.
};
// A growable array of free_list_entry slots protected by a lock.  Slots
// with a zero `valid` flag are free and get reused by free_list_insert().
struct free_list {
struct free_list_entry *entries; // Pointer to entries.
int capacity; // Number of entries.
int used; // Number of valid entries.
lock_t lock; // Thread safety.
};
static void free_list_init(struct free_list *l) {
l->capacity = 30; // Picked arbitrarily.
l->used = 0;
l->entries = (struct free_list_entry*) malloc(sizeof(struct free_list_entry) * l->capacity);
for (int i = 0; i < l->capacity; i++) {
l->entries[i].valid = 0;
}
create_lock(&l->lock);
}
// Remove invalid entries from the free list.
//
// Valid entries are moved to the front of the array, keeping their
// relative order, and the array is then shrunk to the number of valid
// entries (but never below 30 slots).  The list lock is held throughout.
static void free_list_pack(struct free_list *l) {
  lock_lock(&l->lock);
  int p = 0;
  for (int i = 0; i < l->capacity; i++) {
    if (l->entries[i].valid) {
      l->entries[p] = l->entries[i];
      if (i > p) {
        // The entry was moved down, so its old slot is now free.
        l->entries[i].valid = 0;
      }
      p++;
    }
  }
  // Now p is the number of used elements.  We don't want it to go
  // less than the default capacity; in particular it must never reach
  // 0, because free_list_insert() grows the list by doubling.
  if (p < 30) {
    p = 30;
  }
  // Shrinking is only an optimisation.  If realloc() fails, the old
  // (already compacted) array is still valid, so keep it and its
  // capacity instead of overwriting the only pointer to it with NULL.
  struct free_list_entry *shrunk =
    realloc(l->entries, p * sizeof(struct free_list_entry));
  if (shrunk != NULL) {
    l->entries = shrunk;
    l->capacity = p;
  }
  lock_unlock(&l->lock);
}
// Release the slot array and the lock.  The list must no longer hold any
// valid entries; this is asserted.
static void free_list_destroy(struct free_list *l) {
  assert(l->used == 0);
  struct free_list_entry *slots = l->entries;
  free(slots);
  free_lock(&l->lock);
}
// Return the index of the first invalid (free) slot, or l->capacity if
// every slot is valid.  Not part of the interface, so no locking.
static int free_list_find_invalid(struct free_list *l) {
  int slot = 0;
  while (slot < l->capacity && l->entries[slot].valid) {
    slot++;
  }
  return slot;
}
// Record the block `mem` of the given size and tag in the free list.
//
// The first invalid slot is reused; if there is none, the slot array is
// doubled first.  The list lock is held throughout.
static void free_list_insert(struct free_list *l, size_t size, fl_mem mem, const char *tag) {
  lock_lock(&l->lock);
  int i = free_list_find_invalid(l);
  if (i == l->capacity) {
    // List is full; so we have to grow it.  The byte count is computed in
    // size_t so that it is not truncated to int.
    size_t new_bytes = (size_t)l->capacity * 2 * sizeof(struct free_list_entry);
    struct free_list_entry *grown = realloc(l->entries, new_bytes);
    assert(grown != NULL);
    l->entries = grown;
    // The second half of the array is new, so mark those slots as free.
    for (int j = 0; j < l->capacity; j++) {
      l->entries[j+l->capacity].valid = 0;
    }
    l->capacity *= 2;
  }
  // Now 'i' points to the first invalid entry.
  l->entries[i].valid = 1;
  l->entries[i].size = size;
  l->entries[i].mem = mem;
  l->entries[i].tag = tag;
  l->used++;
  lock_unlock(&l->lock);
}
// Decide whether `entry` may be used to satisfy a request for `size`
// bytes with the given tag.  Not public, so no locking.
//
// Besides the hard requirement (the entry is valid and big enough), the
// amount of wasted space (internal fragmentation) is capped.  This is
// necessarily a heuristic, and a crude one:
//
// 1) We don't care about wasted space in blocks below 4096 bytes (to
//    avoid churn in tiny allocations).
//
// 2) If the tag matches (pointer comparison), we allow _any_ amount of
//    wasted space.
//
// 3) Otherwise the block must be smaller than twice the requested size.
static bool free_list_acceptable(size_t size, const char* tag, struct free_list_entry *entry) {
  const bool fits = entry->valid && !(size > entry->size);
  if (!fits) {
    return false;
  }
  return entry->size < 4096
    || entry->tag == tag
    || entry->size < size * 2;
}
// Find and remove the smallest block that free_list_acceptable() accepts
// for the given size and tag; among equally small blocks the one in the
// lowest slot wins.  On success the block's size and handle are stored in
// *size_out and *mem_out and 0 is returned; otherwise 1 is returned and
// the outputs are left untouched.  The list lock is held throughout.
static int free_list_find(struct free_list *l, size_t size, const char *tag,
                          size_t *size_out, fl_mem *mem_out) {
  lock_lock(&l->lock);
  int best = -1;
  for (int slot = 0; slot < l->capacity; slot++) {
    if (!free_list_acceptable(size, tag, &l->entries[slot])) {
      continue;
    }
    // Keep the first acceptable entry, then replace it only with a
    // strictly smaller one.
    if (best < 0 || l->entries[slot].size < l->entries[best].size) {
      best = slot;
    }
  }
  const int found = best >= 0;
  if (found) {
    l->entries[best].valid = 0;
    *size_out = l->entries[best].size;
    *mem_out = l->entries[best].mem;
    l->used--;
  }
  lock_unlock(&l->lock);
  return found ? 0 : 1;
}
// Remove the block in the lowest valid slot and store its handle in
// *mem_out.  Returns 0 if a block was removed, and nonzero if the free
// list was already empty.  The list lock is held throughout.
static int free_list_first(struct free_list *l, fl_mem *mem_out) {
  int status = 1;
  lock_lock(&l->lock);
  int slot = 0;
  while (slot < l->capacity && !l->entries[slot].valid) {
    slot++;
  }
  if (slot < l->capacity) {
    l->entries[slot].valid = 0;
    *mem_out = l->entries[slot].mem;
    l->used--;
    status = 0;
  }
  lock_unlock(&l->lock);
  return status;
}
// End of free_list.h.
#ifdef _MSC_VER
#define inline __inline
#endif
#include <string.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <ctype.h>
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define FUTHARK_F64_ENABLED
// Start of scalar.h.
// Implementation of the primitive scalar operations. Very
// repetitive. This code is inserted directly into both CUDA and
// OpenCL programs, as well as the CPU code, so it has some #ifdefs to
// work everywhere. Some operations are defined as macros because
// this allows us to use them as constant expressions in things like
// array sizes and static initialisers.
// Some of the #ifdefs are because OpenCL uses type-generic functions
// for some operations (e.g. sqrt), while C and CUDA sensibly use
// distinct functions for different precisions (e.g. sqrtf() and
// sqrt()). This is quite annoying. Due to C's unfortunate casting
// rules, it is also really easy to accidentally implement
// floating-point functions in the wrong precision, so be careful.
// Double-precision definitions are only included if the preprocessor
// macro FUTHARK_F64_ENABLED is set.
// 8-bit addition; the sum is reduced modulo 2^8 by the conversion to uint8_t.
static inline uint8_t add8(uint8_t x, uint8_t y) {
  const uint8_t sum = (uint8_t)(x + y);
  return sum;
}
// 16-bit addition; the sum is reduced modulo 2^16 by the conversion to uint16_t.
static inline uint16_t add16(uint16_t x, uint16_t y) {
  const uint16_t sum = (uint16_t)(x + y);
  return sum;
}
// 32-bit unsigned addition; wraps modulo 2^32.
static inline uint32_t add32(uint32_t x, uint32_t y) {
  uint32_t sum = x;
  sum += y;
  return sum;
}
// 64-bit unsigned addition; wraps modulo 2^64.
static inline uint64_t add64(uint64_t x, uint64_t y) {
  uint64_t sum = x;
  sum += y;
  return sum;
}
// 8-bit subtraction; the difference is reduced modulo 2^8 by the conversion
// to uint8_t.
static inline uint8_t sub8(uint8_t x, uint8_t y) {
  const uint8_t difference = (uint8_t)(x - y);
  return difference;
}
// 16-bit subtraction; the difference is reduced modulo 2^16 by the
// conversion to uint16_t.
static inline uint16_t sub16(uint16_t x, uint16_t y) {
  const uint16_t difference = (uint16_t)(x - y);
  return difference;
}
// 32-bit unsigned subtraction; wraps modulo 2^32.
static inline uint32_t sub32(uint32_t x, uint32_t y) {
  uint32_t difference = x;
  difference -= y;
  return difference;
}
// 64-bit unsigned subtraction; wraps modulo 2^64.
static inline uint64_t sub64(uint64_t x, uint64_t y) {
  uint64_t difference = x;
  difference -= y;
  return difference;
}
// 8-bit multiplication; the product is reduced modulo 2^8 by the conversion
// to uint8_t.
static inline uint8_t mul8(uint8_t x, uint8_t y) {
  const uint8_t product = (uint8_t)(x * y);
  return product;
}
// 16-bit multiplication; the result is the product modulo 2^16.
//
// The operands are widened to uint32_t before multiplying.  Without this,
// integer promotion turns both uint16_t operands into (signed) int on
// platforms with 32-bit int, and products above INT_MAX (for example
// 0xFFFF * 0xFFFF) are signed overflow, which is undefined behaviour.
static inline uint16_t mul16(uint16_t x, uint16_t y) {
  return (uint16_t)((uint32_t)x * (uint32_t)y);
}
// 32-bit unsigned multiplication; wraps modulo 2^32.
static inline uint32_t mul32(uint32_t x, uint32_t y) {
  uint32_t product = x;
  product *= y;
  return product;
}
// 64-bit unsigned multiplication; wraps modulo 2^64.
static inline uint64_t mul64(uint64_t x, uint64_t y) {
  uint64_t product = x;
  product *= y;
  return product;
}
#if ISPC
// ISPC variants of unsigned division, x / y, for 8/16/32/64-bit operands.
// All four use the same divisor-masking pattern, explained in udiv8.
static inline uint8_t udiv8(uint8_t x, uint8_t y) {
// This strange pattern is used to prevent the ISPC compiler from
// causing SIGFPEs and bogus results on divisions where inactive lanes
// have 0-valued divisors. It ensures that any inactive lane instead
// has a divisor of 1. https://github.com/ispc/ispc/issues/2292
uint8_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// 16-bit variant of udiv8.
static inline uint16_t udiv16(uint16_t x, uint16_t y) {
uint16_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// 32-bit variant of udiv8.
static inline uint32_t udiv32(uint32_t x, uint32_t y) {
uint32_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// 64-bit variant of udiv8.
static inline uint64_t udiv64(uint64_t x, uint64_t y) {
uint64_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// ISPC variants of unsigned division rounding up, computed as
// (x + y - 1) / y.  The numerator uses y itself; only the divisor goes
// through the copy `ys`, which starts at 1 and is set to y inside
// foreach_active.
static inline uint8_t udiv_up8(uint8_t x, uint8_t y) {
uint8_t ys = 1;
foreach_active(i){
ys = y;
}
return (x + y - 1) / ys;
}
// 16-bit variant of udiv_up8.
static inline uint16_t udiv_up16(uint16_t x, uint16_t y) {
uint16_t ys = 1;
foreach_active(i){
ys = y;
}
return (x + y - 1) / ys;
}
// 32-bit variant of udiv_up8.
static inline uint32_t udiv_up32(uint32_t x, uint32_t y) {
uint32_t ys = 1;
foreach_active(i){
ys = y;
}
return (x + y - 1) / ys;
}
// 64-bit variant of udiv_up8.
static inline uint64_t udiv_up64(uint64_t x, uint64_t y) {
uint64_t ys = 1;
foreach_active(i){
ys = y;
}
return (x + y - 1) / ys;
}
// ISPC variants of unsigned remainder, x % y.  The divisor goes through
// the copy `ys`, which starts at 1 and is set to y inside foreach_active.
static inline uint8_t umod8(uint8_t x, uint8_t y) {
uint8_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// 16-bit variant of umod8.
static inline uint16_t umod16(uint16_t x, uint16_t y) {
uint16_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// 32-bit variant of umod8.
static inline uint32_t umod32(uint32_t x, uint32_t y) {
uint32_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// 64-bit variant of umod8.
static inline uint64_t umod64(uint64_t x, uint64_t y) {
uint64_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// ISPC variants of "safe" unsigned division: the result is 0 when y is 0
// and x / y otherwise.  The divisor goes through the copy `ys`, which
// starts at 1 and is set to y inside foreach_active.
static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {
uint8_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
// 16-bit variant of udiv_safe8.
static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {
uint16_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
// 32-bit variant of udiv_safe8.
static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {
uint32_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
// 64-bit variant of udiv_safe8.
static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {
uint64_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
// ISPC variants of "safe" unsigned division rounding up: the result is 0
// when y is 0 and (x + y - 1) / y otherwise.  The divisor goes through the
// copy `ys`, which starts at 1 and is set to y inside foreach_active.
static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {
uint8_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : (x + y - 1) / ys;
}
// 16-bit variant of udiv_up_safe8.
static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {
uint16_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : (x + y - 1) / ys;
}
// 32-bit variant of udiv_up_safe8.
static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {
uint32_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : (x + y - 1) / ys;
}
// 64-bit variant of udiv_up_safe8.
static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {
uint64_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : (x + y - 1) / ys;
}
// ISPC variants of "safe" unsigned remainder: the result is 0 when y is 0
// and x % y otherwise.  The divisor goes through the copy `ys`, which
// starts at 1 and is set to y inside foreach_active.
static inline uint8_t umod_safe8(uint8_t x, uint8_t y) {
uint8_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
// 16-bit variant of umod_safe8.
static inline uint16_t umod_safe16(uint16_t x, uint16_t y) {
uint16_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
// 32-bit variant of umod_safe8.
static inline uint32_t umod_safe32(uint32_t x, uint32_t y) {
uint32_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
// 64-bit variant of umod_safe8.
static inline uint64_t umod_safe64(uint64_t x, uint64_t y) {
uint64_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
// ISPC variants of signed division.  The quotient q = x / y is reduced by
// one when the remainder r = x % y is nonzero and the sign of r differs
// from the sign of y.  The divisor goes through the copy `ys`, which starts
// at 1 and is set to y inside foreach_active.
static inline int8_t sdiv8(int8_t x, int8_t y) {
int8_t ys = 1;
foreach_active(i){
ys = y;
}
int8_t q = x / ys;
int8_t r = x % ys;
// Relational operators bind tighter than !=, so this compares (r < 0)
// with (y < 0).
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
// 16-bit variant of sdiv8.
static inline int16_t sdiv16(int16_t x, int16_t y) {
int16_t ys = 1;
foreach_active(i){
ys = y;
}
int16_t q = x / ys;
int16_t r = x % ys;
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
// 32-bit variant of sdiv8.
static inline int32_t sdiv32(int32_t x, int32_t y) {
int32_t ys = 1;
foreach_active(i){
ys = y;
}
int32_t q = x / ys;
int32_t r = x % ys;
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
// 64-bit variant of sdiv8.
static inline int64_t sdiv64(int64_t x, int64_t y) {
int64_t ys = 1;
foreach_active(i){
ys = y;
}
int64_t q = x / ys;
int64_t r = x % ys;
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
// Signed division rounding up, expressed as sdivN(x + y - 1, y) for each
// operand width N.
static inline int8_t sdiv_up8(int8_t x, int8_t y) {
return sdiv8(x + y - 1, y);
}
static inline int16_t sdiv_up16(int16_t x, int16_t y) {
return sdiv16(x + y - 1, y);
}
static inline int32_t sdiv_up32(int32_t x, int32_t y) {
return sdiv32(x + y - 1, y);
}
static inline int64_t sdiv_up64(int64_t x, int64_t y) {
return sdiv64(x + y - 1, y);
}
// ISPC variants of signed modulo.  Starting from the remainder r = x % y,
// y is added to r unless r is zero or x and y are both positive or both
// negative.  The divisor goes through the copy `ys`, which starts at 1 and
// is set to y inside foreach_active.
static inline int8_t smod8(int8_t x, int8_t y) {
int8_t ys = 1;
foreach_active(i){
ys = y;
}
int8_t r = x % ys;
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
// 16-bit variant of smod8.
static inline int16_t smod16(int16_t x, int16_t y) {
int16_t ys = 1;
foreach_active(i){
ys = y;
}
int16_t r = x % ys;
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
// 32-bit variant of smod8.
static inline int32_t smod32(int32_t x, int32_t y) {
int32_t ys = 1;
foreach_active(i){
ys = y;
}
int32_t r = x % ys;
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
// 64-bit variant of smod8.
static inline int64_t smod64(int64_t x, int64_t y) {
int64_t ys = 1;
foreach_active(i){
ys = y;
}
int64_t r = x % ys;
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
// "Safe" signed division: the result is 0 when y is 0 and sdivN(x, y)
// otherwise, for each operand width N.
static inline int8_t sdiv_safe8(int8_t x, int8_t y) {
return y == 0 ? 0 : sdiv8(x, y);
}
static inline int16_t sdiv_safe16(int16_t x, int16_t y) {
return y == 0 ? 0 : sdiv16(x, y);
}
static inline int32_t sdiv_safe32(int32_t x, int32_t y) {
return y == 0 ? 0 : sdiv32(x, y);
}
static inline int64_t sdiv_safe64(int64_t x, int64_t y) {
return y == 0 ? 0 : sdiv64(x, y);
}
// "Safe" signed division rounding up, expressed as
// sdiv_safeN(x + y - 1, y) for each operand width N; the result is
// therefore 0 when y is 0.
static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {
return sdiv_safe8(x + y - 1, y);
}
static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {
return sdiv_safe16(x + y - 1, y);
}
static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {
return sdiv_safe32(x + y - 1, y);
}
static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {
return sdiv_safe64(x + y - 1, y);
}
// "Safe" signed modulo: the result is 0 when y is 0 and smodN(x, y)
// otherwise, for each operand width N.
static inline int8_t smod_safe8(int8_t x, int8_t y) {
return y == 0 ? 0 : smod8(x, y);
}
static inline int16_t smod_safe16(int16_t x, int16_t y) {
return y == 0 ? 0 : smod16(x, y);
}
static inline int32_t smod_safe32(int32_t x, int32_t y) {
return y == 0 ? 0 : smod32(x, y);
}
static inline int64_t smod_safe64(int64_t x, int64_t y) {
return y == 0 ? 0 : smod64(x, y);
}
// ISPC variants of the signed quotient, plain x / y with no adjustment.
// The divisor goes through the copy `ys`, which starts at 1 and is set to
// y inside foreach_active.
static inline int8_t squot8(int8_t x, int8_t y) {
int8_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// 16-bit variant of squot8.
static inline int16_t squot16(int16_t x, int16_t y) {
int16_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// 32-bit variant of squot8.
static inline int32_t squot32(int32_t x, int32_t y) {
int32_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// 64-bit variant of squot8.
static inline int64_t squot64(int64_t x, int64_t y) {
int64_t ys = 1;
foreach_active(i){
ys = y;
}
return x / ys;
}
// ISPC variants of the signed remainder, plain x % y with no adjustment.
// The divisor goes through the copy `ys`, which starts at 1 and is set to
// y inside foreach_active.
static inline int8_t srem8(int8_t x, int8_t y) {
int8_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// 16-bit variant of srem8.
static inline int16_t srem16(int16_t x, int16_t y) {
int16_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// 32-bit variant of srem8.
static inline int32_t srem32(int32_t x, int32_t y) {
int32_t ys = 1;
foreach_active(i){
ys = y;
}
return x % ys;
}
// ISPC variant of the 64-bit signed remainder, plain x % y with no
// adjustment.
//
// The divisor copy `ys` starts at 1 and is set to y inside foreach_active.
// It must have the full operand width: declaring it as int8_t (as the
// 8-bit variant does) would truncate the divisor to eight bits and give
// wrong remainders for any divisor outside the int8_t range.
static inline int64_t srem64(int64_t x, int64_t y) {
  int64_t ys = 1;
  foreach_active(i){
    ys = y;
  }
  return x % ys;
}
static inline int8_t squot_safe8(int8_t x, int8_t y) {
int8_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
static inline int16_t squot_safe16(int16_t x, int16_t y) {
int16_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
static inline int32_t squot_safe32(int32_t x, int32_t y) {
int32_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
static inline int64_t squot_safe64(int64_t x, int64_t y) {
int64_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x / ys;
}
static inline int8_t srem_safe8(int8_t x, int8_t y) {
int8_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
static inline int16_t srem_safe16(int16_t x, int16_t y) {
int16_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
static inline int32_t srem_safe32(int32_t x, int32_t y) {
int32_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
static inline int64_t srem_safe64(int64_t x, int64_t y) {
int64_t ys = 1;
foreach_active(i){
ys = y;
}
return y == 0 ? 0 : x % ys;
}
#else
// Unsigned quotient x / y.  The divisor is not checked here.
static inline uint8_t udiv8(uint8_t x, uint8_t y) { return x / y; }
static inline uint16_t udiv16(uint16_t x, uint16_t y) { return x / y; }
static inline uint32_t udiv32(uint32_t x, uint32_t y) { return x / y; }
static inline uint64_t udiv64(uint64_t x, uint64_t y) { return x / y; }
// Unsigned division with the numerator bumped by y - 1 first.
static inline uint8_t udiv_up8(uint8_t x, uint8_t y) { return (x + y - 1) / y; }
static inline uint16_t udiv_up16(uint16_t x, uint16_t y) { return (x + y - 1) / y; }
static inline uint32_t udiv_up32(uint32_t x, uint32_t y) { return (x + y - 1) / y; }
static inline uint64_t udiv_up64(uint64_t x, uint64_t y) { return (x + y - 1) / y; }
// Unsigned remainder x % y.  The divisor is not checked here.
static inline uint8_t umod8(uint8_t x, uint8_t y) { return x % y; }
static inline uint16_t umod16(uint16_t x, uint16_t y) { return x % y; }
static inline uint32_t umod32(uint32_t x, uint32_t y) { return x % y; }
static inline uint64_t umod64(uint64_t x, uint64_t y) { return x % y; }
// Zero-guarded unsigned quotient: 0 when y == 0, x / y otherwise.
static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
// Zero-guarded variant of (x + y - 1) / y.
static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
// Zero-guarded unsigned remainder: 0 when y == 0, x % y otherwise.
static inline uint8_t umod_safe8(uint8_t x, uint8_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline uint16_t umod_safe16(uint16_t x, uint16_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline uint32_t umod_safe32(uint32_t x, uint32_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline uint64_t umod_safe64(uint64_t x, uint64_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
// Signed division rounding toward negative infinity: start from the
// truncating quotient and subtract 1 when the remainder is non-zero and its
// sign differs from the divisor's.
static inline int8_t sdiv8(int8_t x, int8_t y) {
  const int8_t trunc_q = x / y;
  const int8_t rem = x % y;
  const int round_down = rem != 0 && ((rem < 0) != (y < 0));
  return trunc_q - round_down;
}
static inline int16_t sdiv16(int16_t x, int16_t y) {
  const int16_t trunc_q = x / y;
  const int16_t rem = x % y;
  const int round_down = rem != 0 && ((rem < 0) != (y < 0));
  return trunc_q - round_down;
}
static inline int32_t sdiv32(int32_t x, int32_t y) {
  const int32_t trunc_q = x / y;
  const int32_t rem = x % y;
  const int round_down = rem != 0 && ((rem < 0) != (y < 0));
  return trunc_q - round_down;
}
static inline int64_t sdiv64(int64_t x, int64_t y) {
  const int64_t trunc_q = x / y;
  const int64_t rem = x % y;
  const int round_down = rem != 0 && ((rem < 0) != (y < 0));
  return trunc_q - round_down;
}
// The _up variants add y - 1 to the numerator before calling sdivN.
static inline int8_t sdiv_up8(int8_t x, int8_t y) { return sdiv8(x + y - 1, y); }
static inline int16_t sdiv_up16(int16_t x, int16_t y) { return sdiv16(x + y - 1, y); }
static inline int32_t sdiv_up32(int32_t x, int32_t y) { return sdiv32(x + y - 1, y); }
static inline int64_t sdiv_up64(int64_t x, int64_t y) { return sdiv64(x + y - 1, y); }
// Signed modulo: take the C remainder and, unless it is zero or x and y are
// both positive / both negative, shift it by y.
static inline int8_t smod8(int8_t x, int8_t y) {
  const int8_t rem = x % y;
  if (rem == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
static inline int16_t smod16(int16_t x, int16_t y) {
  const int16_t rem = x % y;
  if (rem == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
static inline int32_t smod32(int32_t x, int32_t y) {
  const int32_t rem = x % y;
  if (rem == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
static inline int64_t smod64(int64_t x, int64_t y) {
  const int64_t rem = x % y;
  if (rem == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
// Zero-guarded floor division: 0 for a zero divisor, sdivN(x, y) otherwise.
static inline int8_t sdiv_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv8(x, y);
}
static inline int16_t sdiv_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv16(x, y);
}
static inline int32_t sdiv_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv32(x, y);
}
static inline int64_t sdiv_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv64(x, y);
}
// The _up variants add y - 1 to the numerator, then use the guarded division.
static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) { return sdiv_safe8(x + y - 1, y); }
static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) { return sdiv_safe16(x + y - 1, y); }
static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) { return sdiv_safe32(x + y - 1, y); }
static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) { return sdiv_safe64(x + y - 1, y); }
// Zero-guarded modulo: 0 for a zero divisor, smodN(x, y) otherwise.
static inline int8_t smod_safe8(int8_t x, int8_t y) {
  if (y != 0) {
    return smod8(x, y);
  }
  return 0;
}
static inline int16_t smod_safe16(int16_t x, int16_t y) {
  if (y != 0) {
    return smod16(x, y);
  }
  return 0;
}
static inline int32_t smod_safe32(int32_t x, int32_t y) {
  if (y != 0) {
    return smod32(x, y);
  }
  return 0;
}
static inline int64_t smod_safe64(int64_t x, int64_t y) {
  if (y != 0) {
    return smod64(x, y);
  }
  return 0;
}
// Signed quotient using C's native '/' operator; the divisor is not checked.
static inline int8_t squot8(int8_t x, int8_t y) { return x / y; }
static inline int16_t squot16(int16_t x, int16_t y) { return x / y; }
static inline int32_t squot32(int32_t x, int32_t y) { return x / y; }
static inline int64_t squot64(int64_t x, int64_t y) { return x / y; }
// Signed remainder using C's native '%' operator; the divisor is not checked.
static inline int8_t srem8(int8_t x, int8_t y) { return x % y; }
static inline int16_t srem16(int16_t x, int16_t y) { return x % y; }
static inline int32_t srem32(int32_t x, int32_t y) { return x % y; }
static inline int64_t srem64(int64_t x, int64_t y) { return x % y; }
// Zero-guarded signed quotient: 0 when y == 0, x / y otherwise.
static inline int8_t squot_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline int16_t squot_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline int32_t squot_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline int64_t squot_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
// Zero-guarded signed remainder: 0 when y == 0, x % y otherwise.
static inline int8_t srem_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline int16_t srem_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline int32_t srem_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline int64_t srem_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
#endif
// Minimum of two signed values; y is returned when the two compare equal.
static inline int8_t smin8(int8_t x, int8_t y) { return y > x ? x : y; }
static inline int16_t smin16(int16_t x, int16_t y) { return y > x ? x : y; }
static inline int32_t smin32(int32_t x, int32_t y) { return y > x ? x : y; }
static inline int64_t smin64(int64_t x, int64_t y) { return y > x ? x : y; }
// Minimum of two unsigned values.
static inline uint8_t umin8(uint8_t x, uint8_t y) { return y > x ? x : y; }
static inline uint16_t umin16(uint16_t x, uint16_t y) { return y > x ? x : y; }
static inline uint32_t umin32(uint32_t x, uint32_t y) { return y > x ? x : y; }
static inline uint64_t umin64(uint64_t x, uint64_t y) { return y > x ? x : y; }
// Maximum of two signed values; x is returned when the two compare equal.
static inline int8_t smax8(int8_t x, int8_t y) { return y > x ? y : x; }
static inline int16_t smax16(int16_t x, int16_t y) { return y > x ? y : x; }
static inline int32_t smax32(int32_t x, int32_t y) { return y > x ? y : x; }
static inline int64_t smax64(int64_t x, int64_t y) { return y > x ? y : x; }
// Maximum of two unsigned values.
static inline uint8_t umax8(uint8_t x, uint8_t y) { return y > x ? y : x; }
static inline uint16_t umax16(uint16_t x, uint16_t y) { return y > x ? y : x; }
static inline uint32_t umax32(uint32_t x, uint32_t y) { return y > x ? y : x; }
static inline uint64_t umax64(uint64_t x, uint64_t y) { return y > x ? y : x; }
// Left shift.  The 8- and 16-bit versions cast the shifted value back to
// their own width; the 32- and 64-bit versions return x << y directly.
static inline uint8_t shl8(uint8_t x, uint8_t y) { return (uint8_t)(x << y); }
static inline uint16_t shl16(uint16_t x, uint16_t y) { return (uint16_t)(x << y); }
static inline uint32_t shl32(uint32_t x, uint32_t y) { return x << y; }
static inline uint64_t shl64(uint64_t x, uint64_t y) { return x << y; }
// Right shift on unsigned operands.
static inline uint8_t lshr8(uint8_t x, uint8_t y) { return x >> y; }
static inline uint16_t lshr16(uint16_t x, uint16_t y) { return x >> y; }
static inline uint32_t lshr32(uint32_t x, uint32_t y) { return x >> y; }
static inline uint64_t lshr64(uint64_t x, uint64_t y) { return x >> y; }
// Right shift on signed operands, using the language's own '>>'.
static inline int8_t ashr8(int8_t x, int8_t y) { return x >> y; }
static inline int16_t ashr16(int16_t x, int16_t y) { return x >> y; }
static inline int32_t ashr32(int32_t x, int32_t y) { return x >> y; }
static inline int64_t ashr64(int64_t x, int64_t y) { return x >> y; }
// Bitwise AND.
static inline uint8_t and8(uint8_t x, uint8_t y) { return x & y; }
static inline uint16_t and16(uint16_t x, uint16_t y) { return x & y; }
static inline uint32_t and32(uint32_t x, uint32_t y) { return x & y; }
static inline uint64_t and64(uint64_t x, uint64_t y) { return x & y; }
// Bitwise OR.
static inline uint8_t or8(uint8_t x, uint8_t y) { return x | y; }
static inline uint16_t or16(uint16_t x, uint16_t y) { return x | y; }
static inline uint32_t or32(uint32_t x, uint32_t y) { return x | y; }
static inline uint64_t or64(uint64_t x, uint64_t y) { return x | y; }
// Bitwise exclusive OR.
static inline uint8_t xor8(uint8_t x, uint8_t y) { return x ^ y; }
static inline uint16_t xor16(uint16_t x, uint16_t y) { return x ^ y; }
static inline uint32_t xor32(uint32_t x, uint32_t y) { return x ^ y; }
static inline uint64_t xor64(uint64_t x, uint64_t y) { return x ^ y; }
// Unsigned strictly-less-than.
static inline bool ult8(uint8_t x, uint8_t y) { return x < y; }
static inline bool ult16(uint16_t x, uint16_t y) { return x < y; }
static inline bool ult32(uint32_t x, uint32_t y) { return x < y; }
static inline bool ult64(uint64_t x, uint64_t y) { return x < y; }
// Unsigned less-than-or-equal.
static inline bool ule8(uint8_t x, uint8_t y) { return x <= y; }
static inline bool ule16(uint16_t x, uint16_t y) { return x <= y; }
static inline bool ule32(uint32_t x, uint32_t y) { return x <= y; }
static inline bool ule64(uint64_t x, uint64_t y) { return x <= y; }
// Signed strictly-less-than.
static inline bool slt8(int8_t x, int8_t y) { return x < y; }
static inline bool slt16(int16_t x, int16_t y) { return x < y; }
static inline bool slt32(int32_t x, int32_t y) { return x < y; }
static inline bool slt64(int64_t x, int64_t y) { return x < y; }
// Signed less-than-or-equal.
static inline bool sle8(int8_t x, int8_t y) { return x <= y; }
static inline bool sle16(int16_t x, int16_t y) { return x <= y; }
static inline bool sle32(int32_t x, int32_t y) { return x <= y; }
static inline bool sle64(int64_t x, int64_t y) { return x <= y; }
// x raised to the power y by square-and-multiply, with every intermediate
// result stored back into uint8_t.
static inline uint8_t pow8(uint8_t x, uint8_t y) {
  uint8_t acc = 1;
  for (uint8_t bits = y; bits != 0; bits >>= 1) {
    if (bits & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
// x raised to the power y (modulo 2^16) by square-and-multiply.
//
// The products are formed in uint32_t on purpose: in C a uint16_t operand is
// promoted to signed int before multiplying, so squaring any value >= 46341
// directly would overflow a 32-bit int, which is undefined behaviour.  The
// unsigned 32-bit product wraps in a defined way and is then narrowed back
// to 16 bits.
static inline uint16_t pow16(uint16_t x, uint16_t y) {
  uint16_t res = 1, rem = y;
  while (rem != 0) {
    if (rem & 1)
      res = (uint16_t)((uint32_t)res * x);
    rem >>= 1;
    // Squared on every round, including the last, as the other widths do.
    x = (uint16_t)((uint32_t)x * x);
  }
  return res;
}
// x raised to the power y by square-and-multiply in unsigned 32-bit
// arithmetic.
static inline uint32_t pow32(uint32_t x, uint32_t y) {
  uint32_t acc = 1;
  for (uint32_t bits = y; bits != 0; bits >>= 1) {
    if (bits & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
// 64-bit counterpart of pow32.
static inline uint64_t pow64(uint64_t x, uint64_t y) {
  uint64_t acc = 1;
  for (uint64_t bits = y; bits != 0; bits >>= 1) {
    if (bits & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
// Integer to bool: true for any non-zero value.
static inline bool itob_i8_bool(int8_t x) { return x != 0; }
static inline bool itob_i16_bool(int16_t x) { return x != 0; }
static inline bool itob_i32_bool(int32_t x) { return x != 0; }
static inline bool itob_i64_bool(int64_t x) { return x != 0; }
// Bool to integer, relying on the implicit bool-to-integer conversion.
static inline int8_t btoi_bool_i8(bool x) { return x; }
static inline int16_t btoi_bool_i16(bool x) { return x; }
static inline int32_t btoi_bool_i32(bool x) { return x; }
static inline int64_t btoi_bool_i64(bool x) { return x; }
// sext_iA_iB(x): cast x to the signed A-bit type first, then cast that value
// to the signed B-bit type.  When B is wider than A the inner signed cast is
// what makes the widening carry the sign; when B is narrower the outer cast
// simply converts down to the smaller type.
#define sext_i8_i8(x) ((int8_t) (int8_t) (x))
#define sext_i8_i16(x) ((int16_t) (int8_t) (x))
#define sext_i8_i32(x) ((int32_t) (int8_t) (x))
#define sext_i8_i64(x) ((int64_t) (int8_t) (x))
#define sext_i16_i8(x) ((int8_t) (int16_t) (x))
#define sext_i16_i16(x) ((int16_t) (int16_t) (x))
#define sext_i16_i32(x) ((int32_t) (int16_t) (x))
#define sext_i16_i64(x) ((int64_t) (int16_t) (x))
#define sext_i32_i8(x) ((int8_t) (int32_t) (x))
#define sext_i32_i16(x) ((int16_t) (int32_t) (x))
#define sext_i32_i32(x) ((int32_t) (int32_t) (x))
#define sext_i32_i64(x) ((int64_t) (int32_t) (x))
#define sext_i64_i8(x) ((int8_t) (int64_t) (x))
#define sext_i64_i16(x) ((int16_t) (int64_t) (x))
#define sext_i64_i32(x) ((int32_t) (int64_t) (x))
#define sext_i64_i64(x) ((int64_t) (int64_t) (x))
// zext_iA_iB(x): cast x to the UNSIGNED A-bit type first, then cast that
// value to the SIGNED B-bit type.  Going through the unsigned type means a
// widening conversion sees a non-negative value; note that the result type
// is still intB_t, not uintB_t.
#define zext_i8_i8(x) ((int8_t) (uint8_t) (x))
#define zext_i8_i16(x) ((int16_t) (uint8_t) (x))
#define zext_i8_i32(x) ((int32_t) (uint8_t) (x))
#define zext_i8_i64(x) ((int64_t) (uint8_t) (x))
#define zext_i16_i8(x) ((int8_t) (uint16_t) (x))
#define zext_i16_i16(x) ((int16_t) (uint16_t) (x))
#define zext_i16_i32(x) ((int32_t) (uint16_t) (x))
#define zext_i16_i64(x) ((int64_t) (uint16_t) (x))
#define zext_i32_i8(x) ((int8_t) (uint32_t) (x))
#define zext_i32_i16(x) ((int16_t) (uint32_t) (x))
#define zext_i32_i32(x) ((int32_t) (uint32_t) (x))
#define zext_i32_i64(x) ((int64_t) (uint32_t) (x))
#define zext_i64_i8(x) ((int8_t) (uint64_t) (x))
#define zext_i64_i16(x) ((int16_t) (uint64_t) (x))
#define zext_i64_i32(x) ((int32_t) (uint64_t) (x))
#define zext_i64_i64(x) ((int64_t) (uint64_t) (x))
// Absolute value per width.  The 8- and 16-bit versions cast the result of
// abs() back to their own type.
static int8_t abs8(int8_t x) { return (int8_t)abs(x); }
static int16_t abs16(int16_t x) { return (int16_t)abs(x); }
static int32_t abs32(int32_t x) { return abs(x); }
// The 64-bit version calls abs() when compiled as OpenCL or ISPC and
// llabs() everywhere else.
static int64_t abs64(int64_t x) {
#if defined(__OPENCL_VERSION__) || defined(ISPC)
  return abs(x);
#else
  return llabs(x);
#endif
}
#if defined(__OPENCL_VERSION__)
// OpenCL branch: every width forwards to popcount().
static int32_t futrts_popc8(int8_t x) { return popcount(x); }
static int32_t futrts_popc16(int16_t x) { return popcount(x); }
static int32_t futrts_popc32(int32_t x) { return popcount(x); }
static int32_t futrts_popc64(int64_t x) { return popcount(x); }
#elif defined(__CUDA_ARCH__)
// CUDA branch: the 8- and 16-bit inputs go through the zext_* macros before
// __popc(); 32-bit uses __popc() directly and 64-bit uses __popcll().
static int32_t futrts_popc8(int8_t x) { return __popc(zext_i8_i32(x)); }
static int32_t futrts_popc16(int16_t x) { return __popc(zext_i16_i32(x)); }
static int32_t futrts_popc32(int32_t x) { return __popc(x); }
static int32_t futrts_popc64(int64_t x) { return __popcll(x); }
#else // Not OpenCL or CUDA, but plain C.
// Plain-C population count: each round clears the lowest set bit with
// x &= x - 1 and counts how many rounds it takes to reach zero.
static int32_t futrts_popc8(uint8_t x) {
  int set_bits = 0;
  while (x) {
    x &= x - 1;
    ++set_bits;
  }
  return set_bits;
}
static int32_t futrts_popc16(uint16_t x) {
  int set_bits = 0;
  while (x) {
    x &= x - 1;
    ++set_bits;
  }
  return set_bits;
}
static int32_t futrts_popc32(uint32_t x) {
  int set_bits = 0;
  while (x) {
    x &= x - 1;
    ++set_bits;
  }
  return set_bits;
}
static int32_t futrts_popc64(uint64_t x) {
  int set_bits = 0;
  while (x) {
    x &= x - 1;
    ++set_bits;
  }
  return set_bits;
}
#endif
#if defined(__OPENCL_VERSION__)
// OpenCL branch: unsigned high-half multiply, forwarded to mul_hi().
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) {
  return mul_hi(a, b);
}
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) {
  return mul_hi(a, b);
}
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) {
  return mul_hi(a, b);
}
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  return mul_hi(a, b);
}
// Signed-operand versions; the declared return types are unsigned.
static uint8_t futrts_smul_hi8(int8_t a, int8_t b) {
  return mul_hi(a, b);
}
static uint16_t futrts_smul_hi16(int16_t a, int16_t b) {
  return mul_hi(a, b);
}
static uint32_t futrts_smul_hi32(int32_t a, int32_t b) {
  return mul_hi(a, b);
}
static uint64_t futrts_smul_hi64(int64_t a, int64_t b) {
  return mul_hi(a, b);
}
#elif defined(__CUDA_ARCH__)
// CUDA branch: the 8- and 16-bit versions widen, multiply and shift the
// product down; the 32- and 64-bit versions use the CUDA intrinsics.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) {
  return ((uint16_t)a) * ((uint16_t)b) >> 8;
}
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) {
  return ((uint32_t)a) * ((uint32_t)b) >> 16;
}
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) {
  return __umulhi(a, b);
}
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  return __umul64hi(a, b);
}
// Signed-operand versions; the declared return types are unsigned.
static uint8_t futrts_smul_hi8(int8_t a, int8_t b) {
  return ((int16_t)a) * ((int16_t)b) >> 8;
}
static uint16_t futrts_smul_hi16(int16_t a, int16_t b) {
  return ((int32_t)a) * ((int32_t)b) >> 16;
}
static uint32_t futrts_smul_hi32(int32_t a, int32_t b) {
  return __mulhi(a, b);
}
static uint64_t futrts_smul_hi64(int64_t a, int64_t b) {
  return __mul64hi(a, b);
}
#elif ISPC
// ISPC branch: widen both operands to twice the width, multiply, and shift
// the product down by the operand width to keep its upper half.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) {
  return ((uint16_t)a) * ((uint16_t)b) >> 8;
}
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) {
  return ((uint32_t)a) * ((uint32_t)b) >> 16;
}
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) {
  return ((uint64_t)a) * ((uint64_t)b) >> 32;
}
// Upper 64 bits of the 128-bit product a * b, built from four 32x32
// partial products because no wider integer type is used here.
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  // Split both operands into 32-bit halves.
  uint64_t a_hi = a >> 32;
  uint64_t a_lo = a & 0xffffffff;
  uint64_t b_hi = b >> 32;
  uint64_t b_lo = b & 0xffffffff;
  // Partial products.
  uint64_t lo_lo = a_lo * b_lo;
  uint64_t lo_hi = a_lo * b_hi;
  uint64_t hi_lo = a_hi * b_lo;
  uint64_t hi_hi = a_hi * b_hi;
  // Bits 32..63 of the full product; whatever spills past 32 bits here is
  // the carry into the upper word.
  uint64_t middle = (lo_lo >> 32) + (lo_hi & 0xffffffff) + (hi_lo & 0xffffffff);
  // Upper halves of the two cross products land directly in the upper word.
  uint64_t cross_hi = (lo_hi >> 32) + (hi_lo >> 32);
  return (middle >> 32) + cross_hi + hi_hi;
}
// ISPC branch, signed operands: both inputs are cast to the unsigned type of
// twice the width, multiplied, and shifted down by the operand width; the
// result is then returned through the signed return type.
static int8_t futrts_smul_hi8(int8_t a, int8_t b) {
  return ((uint16_t)a) * ((uint16_t)b) >> 8;
}
static int16_t futrts_smul_hi16(int16_t a, int16_t b) {
  return ((uint32_t)a) * ((uint32_t)b) >> 16;
}
static int32_t futrts_smul_hi32(int32_t a, int32_t b) {
  return ((uint64_t)a) * ((uint64_t)b) >> 32;
}
// Signed counterpart of the 32x32 partial-product scheme used for the
// unsigned 64-bit high multiply: returns h, the combination of the upper
// partial product, the upper halves of the cross products, and the carry
// out of the middle word.
// NOTE(review): the result presumably relies on '>>' of a negative int64_t
// being an arithmetic shift on the target -- confirm.
static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
// High halves come from shifting the SIGNED inputs and are then stored in
// unsigned variables; low halves are masked to 32 bits.
uint64_t ah = a >> 32;
uint64_t al = a & 0xffffffff;
uint64_t bh = b >> 32;
uint64_t bl = b & 0xffffffff;
uint64_t p1 = al * bl;
// Unlike the unsigned version, the two cross products are kept in signed
// variables, so the later 'p2 >> 32' / 'p3 >> 32' shift signed values.
int64_t p2 = al * bh;
int64_t p3 = ah * bl;
uint64_t p4 = ah * bh;
uint64_t p1h = p1 >> 32;
// p2h and p3h are computed but not read again below; m repeats the shifts.
uint64_t p2h = p2 >> 32;
uint64_t p3h = p3 >> 32;
uint64_t p2l = p2 & 0xffffffff;
uint64_t p3l = p3 & 0xffffffff;
// l is the middle word; its bits above 32 are the carry into the result.
uint64_t l = p1h + p2l + p3l;
uint64_t m = (p2 >> 32) + (p3 >> 32);
uint64_t h = (l >> 32) + m + p4;
return h;
}
#else // Not OpenCL, ISPC, or CUDA, but plain C.
// Plain-C branch: widen to the next larger integer type (128-bit compiler
// types for the 64-bit case), multiply, and shift the product down by the
// operand width.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) {
  return ((uint16_t)a) * ((uint16_t)b) >> 8;
}
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) {
  return ((uint32_t)a) * ((uint32_t)b) >> 16;
}
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) {
  return ((uint64_t)a) * ((uint64_t)b) >> 32;
}
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  return ((__uint128_t)a) * ((__uint128_t)b) >> 64;
}
// Signed versions widen to the signed wider type instead.
static int8_t futrts_smul_hi8(int8_t a, int8_t b) {
  return ((int16_t)a) * ((int16_t)b) >> 8;
}
static int16_t futrts_smul_hi16(int16_t a, int16_t b) {
  return ((int32_t)a) * ((int32_t)b) >> 16;
}
static int32_t futrts_smul_hi32(int32_t a, int32_t b) {
  return ((int64_t)a) * ((int64_t)b) >> 32;
}
static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
  return ((__int128_t)a) * ((__int128_t)b) >> 64;
}
#endif
#if defined(__OPENCL_VERSION__)
// OpenCL branch: high-half multiply-add, forwarded to mad_hi().
static uint8_t futrts_umad_hi8(uint8_t a, uint8_t b, uint8_t c) {
  return mad_hi(a, b, c);
}
static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) {
  return mad_hi(a, b, c);
}
static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) {
  return mad_hi(a, b, c);
}
static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) {
  return mad_hi(a, b, c);
}
// Signed-operand versions; the declared return types are unsigned.
static uint8_t futrts_smad_hi8(int8_t a, int8_t b, int8_t c) {
  return mad_hi(a, b, c);
}
static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) {
  return mad_hi(a, b, c);
}
static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) {
  return mad_hi(a, b, c);
}
static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) {
  return mad_hi(a, b, c);
}
#else // Not OpenCL
// Non-OpenCL branch: high-half multiply-add expressed as the matching
// futrts_*mul_hiN(a, b) plus c.
static uint8_t futrts_umad_hi8(uint8_t a, uint8_t b, uint8_t c) {
  return futrts_umul_hi8(a, b) + c;
}
static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) {
  return futrts_umul_hi16(a, b) + c;
}
static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) {
  return futrts_umul_hi32(a, b) + c;
}
static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) {
  return futrts_umul_hi64(a, b) + c;
}
// Signed-operand versions; the declared return types are unsigned.
static uint8_t futrts_smad_hi8(int8_t a, int8_t b, int8_t c) {
  return futrts_smul_hi8(a, b) + c;
}
static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) {
  return futrts_smul_hi16(a, b) + c;
}
static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) {
  return futrts_smul_hi32(a, b) + c;
}
static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) {
  return futrts_smul_hi64(a, b) + c;
}
#endif
#if defined(__OPENCL_VERSION__)
// OpenCL branch: leading-zero count, forwarded to clz() at every width.
static int32_t futrts_clzz8(int8_t x) { return clz(x); }
static int32_t futrts_clzz16(int16_t x) { return clz(x); }
static int32_t futrts_clzz32(int32_t x) { return clz(x); }
static int32_t futrts_clzz64(int64_t x) { return clz(x); }
#elif defined(__CUDA_ARCH__)
// CUDA branch: the 8- and 16-bit inputs go through the zext_* macros into
// __clz(), and the extra 24 / 16 leading bits of the 32-bit word are
// subtracted from the count.
static int32_t futrts_clzz8(int8_t x) { return __clz(zext_i8_i32(x)) - 24; }
static int32_t futrts_clzz16(int16_t x) { return __clz(zext_i16_i32(x)) - 16; }
static int32_t futrts_clzz32(int32_t x) { return __clz(x); }
static int32_t futrts_clzz64(int64_t x) { return __clzll(x); }
#elif ISPC
// ISPC branch: the 8- and 16-bit inputs are cast through their unsigned type
// to int32_t before count_leading_zeros(), and the extra 24 / 16 bits are
// subtracted from the count.
static int32_t futrts_clzz8(int8_t x) { return count_leading_zeros((int32_t)(uint8_t)x)-24; }
static int32_t futrts_clzz16(int16_t x) { return count_leading_zeros((int32_t)(uint16_t)x)-16; }
static int32_t futrts_clzz32(int32_t x) { return count_leading_zeros(x); }
static int32_t futrts_clzz64(int64_t x) { return count_leading_zeros(x); }
#else // Not OpenCL, ISPC or CUDA, but plain C.
// Plain-C branch: a zero input returns the full bit width without touching
// the builtin; other inputs use __builtin_clz / __builtin_clzll, with the
// narrow widths going through the zext_* macros and having the extra
// 24 / 16 bits subtracted.
static int32_t futrts_clzz8(int8_t x) {
  if (x == 0) {
    return 8;
  }
  return __builtin_clz((uint32_t)zext_i8_i32(x)) - 24;
}
static int32_t futrts_clzz16(int16_t x) {
  if (x == 0) {
    return 16;
  }
  return __builtin_clz((uint32_t)zext_i16_i32(x)) - 16;
}
static int32_t futrts_clzz32(int32_t x) {
  if (x == 0) {
    return 32;
  }
  return __builtin_clz((uint32_t)x);
}
static int32_t futrts_clzz64(int64_t x) {
  if (x == 0) {
    return 64;
  }
  return __builtin_clzll((uint64_t)x);
}
#endif
#if defined(__OPENCL_VERSION__)
// OpenCL branch: count trailing zero bits by shifting right until the low
// bit is set, stopping at the bit width (so a zero input returns the width).
static int32_t futrts_ctzz8(int8_t x) {
  int count = 0;
  while (count < 8 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
static int32_t futrts_ctzz16(int16_t x) {
  int count = 0;
  while (count < 16 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
static int32_t futrts_ctzz32(int32_t x) {
  int count = 0;
  while (count < 32 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
static int32_t futrts_ctzz64(int64_t x) {
  int count = 0;
  while (count < 64 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
#elif defined(__CUDA_ARCH__)
// CUDA branch: __ffs / __ffsll result is mapped so that 0 becomes the bit
// width and any other value becomes value - 1.
static int32_t futrts_ctzz8(int8_t x) {
  int first_set = __ffs(x);
  if (first_set == 0) {
    return 8;
  }
  return first_set - 1;
}
static int32_t futrts_ctzz16(int16_t x) {
  int first_set = __ffs(x);
  if (first_set == 0) {
    return 16;
  }
  return first_set - 1;
}
static int32_t futrts_ctzz32(int32_t x) {
  int first_set = __ffs(x);
  if (first_set == 0) {
    return 32;
  }
  return first_set - 1;
}
static int32_t futrts_ctzz64(int64_t x) {
  int first_set = __ffsll(x);
  if (first_set == 0) {
    return 64;
  }
  return first_set - 1;
}
#elif ISPC
// ISPC branch: the 8- and 16-bit versions return the bit width for a zero
// input and otherwise call count_trailing_zeros() on the value cast to
// int32_t; the 32- and 64-bit versions call it directly.
static int32_t futrts_ctzz8(int8_t x) {
  if (x == 0) {
    return 8;
  }
  return count_trailing_zeros((int32_t)x);
}
static int32_t futrts_ctzz16(int16_t x) {
  if (x == 0) {
    return 16;
  }
  return count_trailing_zeros((int32_t)x);
}
static int32_t futrts_ctzz32(int32_t x) { return count_trailing_zeros(x); }
static int32_t futrts_ctzz64(int64_t x) { return count_trailing_zeros(x); }
#else // Not OpenCL or CUDA, but plain C.
// Plain-C branch: a zero input returns the bit width without touching the
// builtin; other inputs use __builtin_ctz / __builtin_ctzll.
static int32_t futrts_ctzz8(int8_t x) {
  if (x == 0) {
    return 8;
  }
  return __builtin_ctz((uint32_t)x);
}
static int32_t futrts_ctzz16(int16_t x) {
  if (x == 0) {
    return 16;
  }
  return __builtin_ctz((uint32_t)x);
}
static int32_t futrts_ctzz32(int32_t x) {
  if (x == 0) {
    return 32;
  }
  return __builtin_ctz((uint32_t)x);
}
static int32_t futrts_ctzz64(int64_t x) {
  if (x == 0) {
    return 64;
  }
  return __builtin_ctzll((uint64_t)x);
}
#endif
// Single-precision arithmetic, one operator per helper.
static inline float fdiv32(float x, float y) { return x / y; }
static inline float fadd32(float x, float y) { return x + y; }
static inline float fsub32(float x, float y) { return x - y; }
static inline float fmul32(float x, float y) { return x * y; }
// Single-precision ordered comparisons.
static inline bool cmplt32(float x, float y) { return x < y; }
static inline bool cmple32(float x, float y) { return x <= y; }
// Signed integer to float via a plain cast.
static inline float sitofp_i8_f32(int8_t x) { return (float) x; }
static inline float sitofp_i16_f32(int16_t x) { return (float) x; }
static inline float sitofp_i32_f32(int32_t x) { return (float) x; }
static inline float sitofp_i64_f32(int64_t x) { return (float) x; }
// Unsigned integer to float via a plain cast.
static inline float uitofp_i8_f32(uint8_t x) { return (float) x; }
static inline float uitofp_i16_f32(uint16_t x) { return (float) x; }
static inline float uitofp_i32_f32(uint32_t x) { return (float) x; }
static inline float uitofp_i64_f32(uint64_t x) { return (float) x; }
#ifdef __OPENCL_VERSION__
// OpenCL branch: each helper forwards to the correspondingly named built-in
// (fabs, fmax, fmin, pow).
static inline float fabs32(float x) { return fabs(x); }
static inline float fmax32(float x, float y) { return fmax(x, y); }
static inline float fmin32(float x, float y) { return fmin(x, y); }
static inline float fpow32(float x, float y) { return pow(x, y); }
#elif ISPC
static inline float fabs32(float x) {
return abs(x);
}
static inline float fmax32(float x, float y) {
return isnan(x) ? y : isnan(y) ? x : max(x, y);
}
static inline float fmin32(float x, float y) {
return isnan(x) ? y : isnan(y) ? x : min(x, y);
}
static inline float fpow32(float a, float b) {
float ret;
foreach_active (i) {
uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
ret = insert(ret, i, r);
}
return ret;
}
#else // Not OpenCL, but CUDA or plain C.
static inline float fabs32(float x) {
return fabsf(x);
}
static inline float fmax32(float x, float y) {
return fmaxf(x, y);
}
static inline float fmin32(float x, float y) {
return fminf(x, y);
}
static inline float fpow32(float x, float y) {
return powf(x, y);
}
#endif
// True when x is NaN, as reported by isnan().
static inline bool futrts_isnan32(float x) { return isnan(x); }
#if ISPC
// ISPC branch: x is treated as infinite when x itself is not NaN but
// x - x is.
static inline bool futrts_isinf32(float x) {
  const bool self_is_nan = isnan(x);
  const bool diff_is_nan = isnan(x - x);
  return !self_is_nan && diff_is_nan;
}
// Finite means neither NaN nor infinite.
static inline bool futrts_isfinite32(float x) {
  return !(isnan(x) || futrts_isinf32(x));
}
#else
// True when x is an infinity, as reported by isinf().
static inline bool futrts_isinf32(float x) { return isinf(x); }
#endif
// Float to signed integer.  NaN and infinities map to 0; every other value
// is converted with a plain cast to the target type.
static inline int8_t fptosi_f32_i8(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (int8_t) x;
  }
  return 0;
}
static inline int16_t fptosi_f32_i16(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (int16_t) x;
  }
  return 0;
}
static inline int32_t fptosi_f32_i32(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (int32_t) x;
  }
  return 0;
}
static inline int64_t fptosi_f32_i64(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (int64_t) x;
  }
  return 0;
}
// Float to unsigned integer.  NaN and infinities map to 0; every other value
// is first cast to the SIGNED type of the same width and then to the
// unsigned return type.
static inline uint8_t fptoui_f32_i8(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (uint8_t) (int8_t) x;
  }
  return 0;
}
static inline uint16_t fptoui_f32_i16(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (uint16_t) (int16_t) x;
  }
  return 0;
}
static inline uint32_t fptoui_f32_i32(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (uint32_t) (int32_t) x;
  }
  return 0;
}
static inline uint64_t fptoui_f32_i64(float x) {
  if (!futrts_isnan32(x) && !futrts_isinf32(x)) {
    return (uint64_t) (int64_t) x;
  }
  return 0;
}
// Float to bool: true for anything that compares unequal to 0.
static inline bool ftob_f32_bool(float x) { return x != 0; }
// Bool to float: 1 for true, 0 for false.
static inline float btof_bool_f32(bool x) {
  if (x) {
    return 1;
  }
  return 0;
}
#ifdef __OPENCL_VERSION__
// OpenCL branch: logarithms, roots and exponential, each forwarded to the
// built-in named in the body.
static inline float futrts_log32(float x) { return log(x); }
static inline float futrts_log2_32(float x) { return log2(x); }
static inline float futrts_log10_32(float x) { return log10(x); }
static inline float futrts_log1p_32(float x) { return log1p(x); }
static inline float futrts_sqrt32(float x) { return sqrt(x); }
static inline float futrts_cbrt32(float x) { return cbrt(x); }
static inline float futrts_exp32(float x) { return exp(x); }
// OpenCL branch: trigonometric functions and their inverses, each forwarded
// to the built-in named in the body.
static inline float futrts_cos32(float x) { return cos(x); }
static inline float futrts_sin32(float x) { return sin(x); }
static inline float futrts_tan32(float x) { return tan(x); }
static inline float futrts_acos32(float x) { return acos(x); }
static inline float futrts_asin32(float x) { return asin(x); }
static inline float futrts_atan32(float x) { return atan(x); }
// OpenCL branch: hyperbolic functions and their inverses, each forwarded to
// the built-in named in the body.
static inline float futrts_cosh32(float x) { return cosh(x); }
static inline float futrts_sinh32(float x) { return sinh(x); }
static inline float futrts_tanh32(float x) { return tanh(x); }
static inline float futrts_acosh32(float x) { return acosh(x); }
static inline float futrts_asinh32(float x) { return asinh(x); }
static inline float futrts_atanh32(float x) { return atanh(x); }
// OpenCL branch: two-argument helpers; arguments are passed through in the
// order received.
static inline float futrts_atan2_32(float x, float y) { return atan2(x, y); }
static inline float futrts_hypot32(float x, float y) { return hypot(x, y); }
// Gamma and error-function family; futrts_gamma32 maps to tgamma.
static inline float futrts_gamma32(float x) { return tgamma(x); }
static inline float futrts_lgamma32(float x) { return lgamma(x); }
static inline float futrts_erf32(float x) { return erf(x); }
static inline float futrts_erfc32(float x) { return erfc(x); }
// OpenCL branch: remainder and rounding helpers.  futrts_round32 maps to
// rint, not round.
static inline float fmod32(float x, float y) { return fmod(x, y); }
static inline float futrts_round32(float x) { return rint(x); }
static inline float futrts_floor32(float x) { return floor(x); }
static inline float futrts_ceil32(float x) { return ceil(x); }
static inline float futrts_nextafter32(float x, float y) { return nextafter(x, y); }
// Interpolation and multiply-add; lerp maps to mix, and mad / fma stay
// separate built-ins.
static inline float futrts_lerp32(float v0, float v1, float t) { return mix(v0, v1, t); }
static inline float futrts_mad32(float a, float b, float c) { return mad(a, b, c); }
static inline float futrts_fma32(float a, float b, float c) { return fma(a, b, c); }
#elif ISPC
// ISPC branch: log(x) is only evaluated for finite x or for a negative
// infinity; every other input (NaN, +infinity) is returned unchanged.
static inline float futrts_log32(float x) {
  if (futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)) {
    return log(x);
  }
  return x;
}
// Base-2 and base-10 logarithms by dividing the natural log by log(base).
static inline float futrts_log2_32(float x) {
  const float ln_x = futrts_log32(x);
  return ln_x / log(2.0f);
}
static inline float futrts_log10_32(float x) {
  const float ln_x = futrts_log32(x);
  return ln_x / log(10.0f);
}
// ISPC branch: log(1 + x) with a correction term.  x == -1 and +infinity are
// answered up front with x / 0.0f.
static inline float futrts_log1p_32(float x) {
  if (x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) {
    return x / 0.0f;
  }
  const float one_plus_x = 1.0f + x;
  // What 1 + x actually added, recovered from the rounded sum.
  const float recovered = one_plus_x - 1.0f;
  const float correction = (recovered - x) / one_plus_x;
  return log(one_plus_x) - correction;
}
// ISPC branch: square root via sqrt().
static inline float futrts_sqrt32(float x) { return sqrt(x); }
extern "C" unmasked uniform float cbrtf(uniform float);
static inline float futrts_cbrt32(float x) {
float res;
foreach_active (i) {
uniform float r = cbrtf(extract(x, i));
res = insert(res, i, r);
}
return res;
}
// ISPC branch: exponential and trigonometric helpers, each forwarded to the
// function named in the body.
static inline float futrts_exp32(float x) { return exp(x); }
static inline float futrts_cos32(float x) { return cos(x); }
static inline float futrts_sin32(float x) { return sin(x); }
static inline float futrts_tan32(float x) { return tan(x); }
static inline float futrts_acos32(float x) { return acos(x); }
static inline float futrts_asin32(float x) { return asin(x); }
static inline float futrts_atan32(float x) { return atan(x); }
// ISPC branch: hyperbolic cosine as (e^x + e^-x) / 2.
static inline float futrts_cosh32(float x) {
  const float grow = exp(x);
  const float decay = exp(-x);
  return (grow + decay) / 2.0f;
}
// ISPC branch: hyperbolic sine as (e^x - e^-x) / 2.
static inline float futrts_sinh32(float x) {
  const float grow = exp(x);
  const float decay = exp(-x);
  return (grow - decay) / 2.0f;
}
// ISPC branch: hyperbolic tangent.
//
// Computed as sign(x) * (1 - t) / (1 + t) with t = exp(-2|x|).  Because t is
// always in [0, 1], nothing overflows: for large |x| t becomes 0 and the
// result is +/-1.  Dividing sinh by cosh instead produces inf / inf = NaN as
// soon as exp(|x|) overflows single precision.  NaN inputs still yield NaN.
static inline float futrts_tanh32(float x) {
  float t = exp(-2.0f * abs(x));
  float magnitude = (1.0f - t) / (1.0f + t);
  return x < 0.0f ? -magnitude : magnitude;
}
// ISPC branch: inverse hyperbolic cosine as log(x + sqrt(x*x - 1)).  When
// the log argument is not finite it is returned as-is instead.
static inline float futrts_acosh32(float x) {
  const float arg = x+sqrt(x*x-1);
  return futrts_isfinite32(arg) ? log(arg) : arg;
}
// ISPC branch: inverse hyperbolic sine as log(x + sqrt(x*x + 1)), with the
// same pass-through for a non-finite log argument.
static inline float futrts_asinh32(float x) {
  const float arg = x+sqrt(x*x+1);
  return futrts_isfinite32(arg) ? log(arg) : arg;
}
// ISPC branch: inverse hyperbolic tangent as log((1 + x) / (1 - x)) / 2,
// with the same pass-through for a non-finite ratio.
static inline float futrts_atanh32(float x) {
  const float ratio = (1+x)/(1-x);
  return futrts_isfinite32(ratio) ? log(ratio)/2.0f : ratio;
}
// ISPC branch: atan2(x, y), except that both arguments being zero is
// answered with 0 without calling atan2.
static inline float futrts_atan2_32(float x, float y) {
  if (x == 0.0f && y == 0.0f) {
    return 0.0f;
  }
  return atan2(x, y);
}
// ISPC branch: sqrt(x*x + y*y) computed on rescaled operands.
static inline float futrts_hypot32(float x, float y) {
  // Non-finite inputs first: any infinity gives INFINITY, otherwise x + y
  // is returned (at least one operand is NaN on this path).
  if (!(futrts_isfinite32(x) && futrts_isfinite32(y))) {
    if (futrts_isinf32(x) || futrts_isinf32(y)) {
      return INFINITY;
    }
    return x + y;
  }
  x = abs(x);
  y = abs(y);
  // Order the magnitudes so that big >= small.
  float big;
  float small;
  if (x >= y) {
    big = x;
    small = y;
  } else {
    big = y;
    small = x;
  }
  if (small == 0) {
    return big;
  }
  // Scale both operands by the exponent of the larger one, take the root of
  // the sum of squares, then scale the result back.
  int e;
  float big_scaled = frexp (big, &e);
  float small_scaled = ldexp (small, - e);
  float root = sqrt (big_scaled * big_scaled + small_scaled * small_scaled);
  return ldexp (root, e);
}
extern "C" unmasked uniform float tgammaf(uniform float x);
static inline float futrts_gamma32(float x) {
float res;
foreach_active (i) {
uniform float r = tgammaf(extract(x, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform float lgammaf(uniform float x);
static inline float futrts_lgamma32(float x) {
float res;
foreach_active (i) {
uniform float r = lgammaf(extract(x, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform float erff(uniform float x);
static inline float futrts_erf32(float x) {
float res;
foreach_active (i) {
uniform float r = erff(extract(x, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform float erfcf(uniform float x);
static inline float futrts_erfc32(float x) {
float res;
foreach_active (i) {
uniform float r = erfcf(extract(x, i));
res = insert(res, i, r);
}
return res;
}
static inline float fmod32(float x, float y) {
return x - y * trunc(x/y);
}
static inline float futrts_round32(float x) {
return round(x);
}
static inline float futrts_floor32(float x) {
return floor(x);
}
static inline float futrts_ceil32(float x) {
return ceil(x);
}
extern "C" unmasked uniform float nextafterf(uniform float x, uniform float y);
static inline float futrts_nextafter32(float x, float y) {
float res;
foreach_active (i) {
uniform float r = nextafterf(extract(x, i), extract(y, i));
res = insert(res, i, r);
}
return res;
}
// Linear interpolation: v0 + (v1 - v0) * t.
static inline float futrts_lerp32(float v0, float v1, float t) {
  const float result = v0 + (v1 - v0) * t;
  return result;
}
// Multiply-add written as a plain a * b + c.
static inline float futrts_mad32(float a, float b, float c) {
  const float result = a * b + c;
  return result;
}
// Here "fma" is also the plain expression a * b + c.
static inline float futrts_fma32(float a, float b, float c) {
  const float result = a * b + c;
  return result;
}
#else // Not OpenCL or ISPC, but CUDA or plain C.
static inline float futrts_log32(float x) {
return logf(x);
}
static inline float futrts_log2_32(float x) {
return log2f(x);
}
static inline float futrts_log10_32(float x) {
return log10f(x);
}
static inline float futrts_log1p_32(float x) {
return log1pf(x);
}
static inline float futrts_sqrt32(float x) {
return sqrtf(x);
}
static inline float futrts_cbrt32(float x) {
return cbrtf(x);
}
static inline float futrts_exp32(float x) {
return expf(x);
}
static inline float futrts_cos32(float x) {
return cosf(x);
}
static inline float futrts_sin32(float x) {
return sinf(x);
}
static inline float futrts_tan32(float x) {
return tanf(x);
}
static inline float futrts_acos32(float x) {
return acosf(x);
}
static inline float futrts_asin32(float x) {
return asinf(x);
}
static inline float futrts_atan32(float x) {
return atanf(x);
}
static inline float futrts_cosh32(float x) {
return coshf(x);
}
static inline float futrts_sinh32(float x) {
return sinhf(x);
}
static inline float futrts_tanh32(float x) {
return tanhf(x);
}
static inline float futrts_acosh32(float x) {
return acoshf(x);
}
static inline float futrts_asinh32(float x) {
return asinhf(x);
}
static inline float futrts_atanh32(float x) {
return atanhf(x);
}
static inline float futrts_atan2_32(float x, float y) {
return atan2f(x, y);
}
static inline float futrts_hypot32(float x, float y) {
return hypotf(x, y);
}
static inline float futrts_gamma32(float x) {
return tgammaf(x);
}
static inline float futrts_lgamma32(float x) {
return lgammaf(x);
}
static inline float futrts_erf32(float x) {
return erff(x);
}
static inline float futrts_erfc32(float x) {
return erfcf(x);
}
static inline float fmod32(float x, float y) {
return fmodf(x, y);
}
static inline float futrts_round32(float x) {
return rintf(x);
}
static inline float futrts_floor32(float x) {
return floorf(x);
}
static inline float futrts_ceil32(float x) {
return ceilf(x);
}
static inline float futrts_nextafter32(float x, float y) {
return nextafterf(x, y);
}
static inline float futrts_lerp32(float v0, float v1, float t) {
return v0 + (v1 - v0) * t;
}
static inline float futrts_mad32(float a, float b, float c) {
return a * b + c;
}
static inline float futrts_fma32(float a, float b, float c) {
return fmaf(a, b, c);
}
#endif
#if ISPC
// Reinterpret an f32 as its 32-bit pattern via intbits.
static inline int32_t futrts_to_bits32(float x) {
  const int32_t bits = intbits(x);
  return bits;
}
// Reinterpret a 32-bit pattern as an f32 via floatbits.
static inline float futrts_from_bits32(int32_t x) {
  const float value = floatbits(x);
  return value;
}
#else
// Reinterpret an f32 as its 32-bit pattern by writing one union member
// and reading the other.
static inline int32_t futrts_to_bits32(float x) {
  union {
    float as_float;
    int32_t as_bits;
  } pun;
  pun.as_float = x;
  return pun.as_bits;
}
// Reinterpret a 32-bit pattern as an f32 by writing one union member
// and reading the other.
static inline float futrts_from_bits32(int32_t x) {
  union {
    int32_t as_bits;
    float as_float;
  } pun;
  pun.as_bits = x;
  return pun.as_float;
}
#endif
// Sign of x as -1, 0 or 1; a NaN input is returned unchanged.
static inline float fsignum32(float x) {
  if (futrts_isnan32(x)) {
    return x;
  }
  return (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);
}
#ifdef FUTHARK_F64_ENABLED
#if ISPC
// True when x is an infinity: x itself is not NaN but x - x is.
// The parameter is double so that large finite f64 values are not
// narrowed before the check.
static inline bool futrts_isinf64(double x) {
  return !isnan(x) && isnan(x - x);
}
// True when x is neither NaN nor an infinity.  Takes double for the
// same reason as futrts_isinf64.
static inline bool futrts_isfinite64(double x) {
  return !isnan(x) && !futrts_isinf64(x);
}
// f64 division.
static inline double fdiv64(double x, double y) {
  const double quotient = x / y;
  return quotient;
}
// f64 addition.
static inline double fadd64(double x, double y) {
  const double sum = x + y;
  return sum;
}
// f64 subtraction.
static inline double fsub64(double x, double y) {
  const double difference = x - y;
  return difference;
}
// f64 multiplication.
static inline double fmul64(double x, double y) {
  const double product = x * y;
  return product;
}
// f64 strict less-than comparison.
static inline bool cmplt64(double x, double y) {
  const bool is_less = x < y;
  return is_less;
}
// f64 less-than-or-equal comparison.
static inline bool cmple64(double x, double y) {
  const bool is_less_eq = x <= y;
  return is_less_eq;
}
// Signed 8-bit integer to f64.
static inline double sitofp_i8_f64(int8_t x) {
  const double converted = (double) x;
  return converted;
}
// Signed 16-bit integer to f64.
static inline double sitofp_i16_f64(int16_t x) {
  const double converted = (double) x;
  return converted;
}
// Signed 32-bit integer to f64.
static inline double sitofp_i32_f64(int32_t x) {
  const double converted = (double) x;
  return converted;
}
// Signed 64-bit integer to f64.
static inline double sitofp_i64_f64(int64_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 8-bit integer to f64.
static inline double uitofp_i8_f64(uint8_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 16-bit integer to f64.
static inline double uitofp_i16_f64(uint16_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 32-bit integer to f64.
static inline double uitofp_i32_f64(uint32_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 64-bit integer to f64.
static inline double uitofp_i64_f64(uint64_t x) {
  const double converted = (double) x;
  return converted;
}
// Absolute value: forwards to abs.
static inline double fabs64(double x) {
  const double magnitude = abs(x);
  return magnitude;
}
// Maximum of x and y; if one operand is NaN the other one is returned.
static inline double fmax64(double x, double y) {
  if (isnan(x)) return y;
  if (isnan(y)) return x;
  return max(x, y);
}
// Minimum of x and y; if one operand is NaN the other one is returned.
static inline double fmin64(double x, double y) {
  if (isnan(x)) return y;
  if (isnan(y)) return x;
  return min(x, y);
}
// a raised to the power b, evaluated once per active lane.
// The accumulator and the per-lane call are kept in double precision;
// routing the computation through float/__stdlib_powf would discard
// more than half of the significand of an f64 result.
// NOTE(review): relies on the double-precision builtin __stdlib_pow
// being available next to __stdlib_powf - confirm against the ISPC
// version in use.
static inline double fpow64(double a, double b) {
  double ret;
  foreach_active (i) {
    uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
    ret = insert(ret, i, r);
  }
  return ret;
}
static inline double futrts_log64(double x) {
return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? log(x) : x;
}
static inline double futrts_log2_64(double x) {
return futrts_log64(x)/log(2.0d);
}
static inline double futrts_log10_64(double x) {
return futrts_log64(x)/log(10.0d);
}
static inline double futrts_log1p_64(double x) {
if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;
double y = 1.0d + x;
double z = y - 1.0d;
return log(y) - (z-x)/y;
}
static inline double futrts_sqrt64(double x) {
return sqrt(x);
}
extern "C" unmasked uniform double cbrt(uniform double);
// f64 cube root, evaluated once per active lane through the external
// double-precision cbrt declared above (not the single-precision cbrtf,
// which would round the argument and the result to f32).
static inline double futrts_cbrt64(double x) {
  double res;
  foreach_active (i) {
    uniform double r = cbrt(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}
static inline double futrts_exp64(double x) {
return exp(x);
}
static inline double futrts_cos64(double x) {
return cos(x);
}
static inline double futrts_sin64(double x) {
return sin(x);
}
static inline double futrts_tan64(double x) {
return tan(x);
}
static inline double futrts_acos64(double x) {
return acos(x);
}
static inline double futrts_asin64(double x) {
return asin(x);
}
static inline double futrts_atan64(double x) {
return atan(x);
}
// Hyperbolic cosine computed as (exp(x) + exp(-x)) / 2.
static inline double futrts_cosh64(double x) {
  const double result = (exp(x)+exp(-x)) / 2.0d;
  return result;
}
// Hyperbolic sine computed as (exp(x) - exp(-x)) / 2.
static inline double futrts_sinh64(double x) {
  const double result = (exp(x)-exp(-x)) / 2.0d;
  return result;
}
// Hyperbolic tangent computed as the quotient sinh / cosh.
static inline double futrts_tanh64(double x) {
  const double result = futrts_sinh64(x)/futrts_cosh64(x);
  return result;
}
// Inverse hyperbolic cosine via log(x + sqrt(x*x - 1)).  When the log
// argument is not finite it is returned as-is.
static inline double futrts_acosh64(double x) {
  const double arg = x+sqrt(x*x-1.0d);
  if (!futrts_isfinite64(arg)) return arg;
  return log(arg);
}
// Inverse hyperbolic sine via log(x + sqrt(x*x + 1)).  When the log
// argument is not finite it is returned as-is.
static inline double futrts_asinh64(double x) {
  const double arg = x+sqrt(x*x+1.0d);
  if (!futrts_isfinite64(arg)) return arg;
  return log(arg);
}
// Inverse hyperbolic tangent via log((1 + x) / (1 - x)) / 2.  When the
// quotient is not finite it is returned as-is.
static inline double futrts_atanh64(double x) {
  const double ratio = (1.0d+x)/(1.0d-x);
  if (!futrts_isfinite64(ratio)) return ratio;
  return log(ratio)/2.0d;
}
static inline double futrts_atan2_64(double x, double y) {
return atan2(x, y);
}
extern "C" unmasked uniform double hypot(uniform double x, uniform double y);
static inline double futrts_hypot64(double x, double y) {
double res;
foreach_active (i) {
uniform double r = hypot(extract(x, i), extract(y, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform double tgamma(uniform double x);
static inline double futrts_gamma64(double x) {
double res;
foreach_active (i) {
uniform double r = tgamma(extract(x, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform double lgamma(uniform double x);
static inline double futrts_lgamma64(double x) {
double res;
foreach_active (i) {
uniform double r = lgamma(extract(x, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform double erf(uniform double x);
static inline double futrts_erf64(double x) {
double res;
foreach_active (i) {
uniform double r = erf(extract(x, i));
res = insert(res, i, r);
}
return res;
}
extern "C" unmasked uniform double erfc(uniform double x);
static inline double futrts_erfc64(double x) {
double res;
foreach_active (i) {
uniform double r = erfc(extract(x, i));
res = insert(res, i, r);
}
return res;
}
static inline double futrts_fma64(double a, double b, double c) {
return a * b + c;
}
static inline double futrts_round64(double x) {
return round(x);
}
static inline double futrts_ceil64(double x) {
return ceil(x);
}
// Both parameters of the external nextafter are double; declaring the
// first one as float would not match the double-precision C function.
extern "C" unmasked uniform double nextafter(uniform double x, uniform double y);
// Next representable f64 after x in the direction of y, evaluated once
// per active lane.  Returns double so the result is not rounded to f32.
static inline double futrts_nextafter64(double x, double y) {
  double res;
  foreach_active (i) {
    uniform double r = nextafter(extract(x, i), extract(y, i));
    res = insert(res, i, r);
  }
  return res;
}
static inline double futrts_floor64(double x) {
return floor(x);
}
static inline bool futrts_isnan64(double x) {
return isnan(x);
}
// f64 to signed 8-bit integer; NaN and infinities map to 0.
static inline int8_t fptosi_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (int8_t) x;
}
// f64 to signed 16-bit integer; NaN and infinities map to 0.
static inline int16_t fptosi_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (int16_t) x;
}
// f64 to signed 32-bit integer; NaN and infinities map to 0.
static inline int32_t fptosi_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (int32_t) x;
}
// f64 to signed 64-bit integer; NaN and infinities map to 0.
static inline int64_t fptosi_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (int64_t) x;
}
// f64 to unsigned 8-bit integer, going through the signed type of the
// same width first; NaN and infinities map to 0.
static inline uint8_t fptoui_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (uint8_t) (int8_t) x;
}
// f64 to unsigned 16-bit integer, going through the signed type of the
// same width first; NaN and infinities map to 0.
static inline uint16_t fptoui_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (uint16_t) (int16_t) x;
}
// f64 to unsigned 32-bit integer, going through the signed type of the
// same width first; NaN and infinities map to 0.
static inline uint32_t fptoui_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (uint32_t) (int32_t) x;
}
// f64 to unsigned 64-bit integer, going through the signed type of the
// same width first; NaN and infinities map to 0.
static inline uint64_t fptoui_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) return 0;
  return (uint64_t) (int64_t) x;
}
// f64 to bool: true for any value that compares unequal to 0.0.
static inline bool ftob_f64_bool(double x) {
  const bool nonzero = x != 0.0;
  return nonzero;
}
// bool to f64: 1.0 for true, 0.0 for false.
static inline double btof_bool_f64(bool x) {
  const double value = x ? 1.0 : 0.0;
  return value;
}
static inline int64_t futrts_to_bits64(double x) {
int64_t res;
foreach_active (i) {
uniform double tmp = extract(x, i);
uniform int64_t r = *((uniform int64_t* uniform)&tmp);
res = insert(res, i, r);
}
return res;
}
static inline double futrts_from_bits64(int64_t x) {
double res;
foreach_active (i) {
uniform int64_t tmp = extract(x, i);
uniform double r = *((uniform double* uniform)&tmp);
res = insert(res, i, r);
}
return res;
}
static inline double fmod64(double x, double y) {
return x - y * trunc(x/y);
}
// Sign of x as -1, 0 or 1; a NaN input is returned unchanged.
static inline double fsignum64(double x) {
  if (futrts_isnan64(x)) {
    return x;
  }
  return (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 1.0d : 0.0d);
}
// Linear interpolation: v0 + (v1 - v0) * t.
static inline double futrts_lerp64(double v0, double v1, double t) {
  const double result = v0 + (v1 - v0) * t;
  return result;
}
// Multiply-add written as a plain a * b + c.
static inline double futrts_mad64(double a, double b, double c) {
  const double result = a * b + c;
  return result;
}
// f32 to f32 conversion (identity cast).
static inline float fpconv_f32_f32(float x) {
  const float converted = (float) x;
  return converted;
}
// f32 to f64 conversion.
static inline double fpconv_f32_f64(float x) {
  const double converted = (double) x;
  return converted;
}
// f64 to f32 conversion.
static inline float fpconv_f64_f32(double x) {
  const float converted = (float) x;
  return converted;
}
// f64 to f64 conversion (identity cast).
static inline double fpconv_f64_f64(double x) {
  const double converted = (double) x;
  return converted;
}
#else
// f64 division.
static inline double fdiv64(double x, double y) {
  const double quotient = x / y;
  return quotient;
}
// f64 addition.
static inline double fadd64(double x, double y) {
  const double sum = x + y;
  return sum;
}
// f64 subtraction.
static inline double fsub64(double x, double y) {
  const double difference = x - y;
  return difference;
}
// f64 multiplication.
static inline double fmul64(double x, double y) {
  const double product = x * y;
  return product;
}
// f64 strict less-than comparison.
static inline bool cmplt64(double x, double y) {
  const bool is_less = x < y;
  return is_less;
}
// f64 less-than-or-equal comparison.
static inline bool cmple64(double x, double y) {
  const bool is_less_eq = x <= y;
  return is_less_eq;
}
// Signed 8-bit integer to f64.
static inline double sitofp_i8_f64(int8_t x) {
  const double converted = (double) x;
  return converted;
}
// Signed 16-bit integer to f64.
static inline double sitofp_i16_f64(int16_t x) {
  const double converted = (double) x;
  return converted;
}
// Signed 32-bit integer to f64.
static inline double sitofp_i32_f64(int32_t x) {
  const double converted = (double) x;
  return converted;
}
// Signed 64-bit integer to f64.
static inline double sitofp_i64_f64(int64_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 8-bit integer to f64.
static inline double uitofp_i8_f64(uint8_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 16-bit integer to f64.
static inline double uitofp_i16_f64(uint16_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 32-bit integer to f64.
static inline double uitofp_i32_f64(uint32_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned 64-bit integer to f64.
static inline double uitofp_i64_f64(uint64_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double fabs64(double x) {
return fabs(x);
}
static inline double fmax64(double x, double y) {
return fmax(x, y);
}
static inline double fmin64(double x, double y) {
return fmin(x, y);
}
static inline double fpow64(double x, double y) {
return pow(x, y);
}
static inline double futrts_log64(double x) {
return log(x);
}
static inline double futrts_log2_64(double x) {
return log2(x);
}
static inline double futrts_log10_64(double x) {
return log10(x);
}
static inline double futrts_log1p_64(double x) {
return log1p(x);
}
static inline double futrts_sqrt64(double x) {
return sqrt(x);
}
static inline double futrts_cbrt64(double x) {
return cbrt(x);
}
static inline double futrts_exp64(double x) {
return exp(x);
}
static inline double futrts_cos64(double x) {
return cos(x);
}
static inline double futrts_sin64(double x) {
return sin(x);
}
static inline double futrts_tan64(double x) {
return tan(x);
}
static inline double futrts_acos64(double x) {
return acos(x);
}
static inline double futrts_asin64(double x) {
return asin(x);
}
static inline double futrts_atan64(double x) {
return atan(x);
}
static inline double futrts_cosh64(double x) {
return cosh(x);
}
static inline double futrts_sinh64(double x) {
return sinh(x);
}
static inline double futrts_tanh64(double x) {
return tanh(x);
}
static inline double futrts_acosh64(double x) {
return acosh(x);
}
static inline double futrts_asinh64(double x) {
return asinh(x);
}
static inline double futrts_atanh64(double x) {
return atanh(x);
}
static inline double futrts_atan2_64(double x, double y) {
return atan2(x, y);
}
static inline double futrts_hypot64(double x, double y) {
return hypot(x, y);
}
static inline double futrts_gamma64(double x) {
return tgamma(x);
}
static inline double futrts_lgamma64(double x) {
return lgamma(x);
}
static inline double futrts_erf64(double x) {
return erf(x);
}
static inline double futrts_erfc64(double x) {
return erfc(x);
}
static inline double futrts_fma64(double a, double b, double c) {
return fma(a, b, c);
}
static inline double futrts_round64(double x) {
return rint(x);
}
static inline double futrts_ceil64(double x) {
return ceil(x);
}
static inline float futrts_nextafter64(float x, float y) {
return nextafter(x, y);
}
static inline double futrts_floor64(double x) {
return floor(x);
}
static inline bool futrts_isnan64(double x) {
return isnan(x);
}
static inline bool futrts_isinf64(double x) {
return isinf(x);
}
static inline int8_t fptosi_f64_i8(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (int8_t) x;
}
}
static inline int16_t fptosi_f64_i16(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (int16_t) x;
}
}
static inline int32_t fptosi_f64_i32(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (int32_t) x;
}
}
static inline int64_t fptosi_f64_i64(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (int64_t) x;
}
}
static inline uint8_t fptoui_f64_i8(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (uint8_t) (int8_t) x;
}
}
static inline uint16_t fptoui_f64_i16(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (uint16_t) (int16_t) x;
}
}
static inline uint32_t fptoui_f64_i32(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (uint32_t) (int32_t) x;
}
}
static inline uint64_t fptoui_f64_i64(double x) {
if (futrts_isnan64(x) || futrts_isinf64(x)) {
return 0;
} else {
return (uint64_t) (int64_t) x;
}
}
// f64 to bool: true for any value that compares unequal to 0.
static inline bool ftob_f64_bool(double x) {
  const bool nonzero = x != 0;
  return nonzero;
}
// bool to f64: 1 for true, 0 for false.
static inline double btof_bool_f64(bool x) {
  const double value = x ? 1 : 0;
  return value;
}
// Reinterpret an f64 as its 64-bit pattern by writing one union member
// and reading the other.
static inline int64_t futrts_to_bits64(double x) {
  union {
    double as_double;
    int64_t as_bits;
  } pun;
  pun.as_double = x;
  return pun.as_bits;
}
// Reinterpret a 64-bit pattern as an f64 by writing one union member
// and reading the other.
static inline double futrts_from_bits64(int64_t x) {
  union {
    int64_t as_bits;
    double as_double;
  } pun;
  pun.as_bits = x;
  return pun.as_double;
}
static inline double fmod64(double x, double y) {
return fmod(x, y);
}
// Sign of x as -1, 0 or 1; a NaN input is returned unchanged.
static inline double fsignum64(double x) {
  if (futrts_isnan64(x)) {
    return x;
  }
  return (x > 0) - (x < 0);
}
// Linear interpolation between v0 and v1: mix() when compiled as
// OpenCL, otherwise the expression v0 + (v1 - v0) * t.
static inline double futrts_lerp64(double v0, double v1, double t) {
#ifdef __OPENCL_VERSION__
  const double result = mix(v0, v1, t);
#else
  const double result = v0 + (v1 - v0) * t;
#endif
  return result;
}
// Multiply-add: mad() when compiled as OpenCL, otherwise a * b + c.
static inline double futrts_mad64(double a, double b, double c) {
#ifdef __OPENCL_VERSION__
  const double result = mad(a, b, c);
#else
  const double result = a * b + c;
#endif
  return result;
}
// f32 to f32 conversion (identity cast).
static inline float fpconv_f32_f32(float x) {
  const float converted = (float) x;
  return converted;
}
// f32 to f64 conversion.
static inline double fpconv_f32_f64(float x) {
  const double converted = (double) x;
  return converted;
}
// f64 to f32 conversion.
static inline float fpconv_f64_f32(double x) {
  const float converted = (float) x;
  return converted;
}
// f64 to f64 conversion (identity cast).
static inline double fpconv_f64_f64(double x) {
  const double converted = (double) x;
  return converted;
}
#endif
#endif
// End of scalar.h.
// Start of scalar_f16.h.
// Half-precision is emulated if needed (e.g. in straight C) with the
// native type used if possible. The emulation works by typedef'ing
// 'float' to 'f16', and then implementing all operations on single
// precision. To cut down on duplication, we use the same code for
// those Futhark functions that require just operators or casts. The
// in-memory representation for arrays will still be 16 bits even
// under emulation, so the compiler will have to be careful when
// generating reads or writes.
// Select emulation unless one of the following holds: cl_khr_fp16 is
// defined, __CUDA_ARCH__ is defined and at least 600, or ISPC is defined.
#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC))
#define EMULATE_F16
#endif
// When not emulating under OpenCL, enable the cl_khr_fp16 extension.
#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
// Pick the type behind 'f16' for the selected configuration.
#ifdef EMULATE_F16
// Emulation: f16 values are held in a float.  Note that the
// half-precision storage format is still 16 bits, so the compiler must
// take care when generating reads and writes.
typedef float f16;
#elif ISPC
// ISPC: f16 is the float16 type.
typedef float16 f16;
#else
// Otherwise f16 is 'half'; when __CUDA_ARCH__ is defined, cuda_fp16.h
// is included first.
#ifdef __CUDA_ARCH__
#include <cuda_fp16.h>
#endif
typedef half f16;
#endif
// Some of these functions convert to single precision because half
// precision versions are not available.
// f16 addition.
static inline f16 fadd16(f16 x, f16 y) {
  const f16 sum = x + y;
  return sum;
}
// f16 subtraction.
static inline f16 fsub16(f16 x, f16 y) {
  const f16 difference = x - y;
  return difference;
}
// f16 multiplication.
static inline f16 fmul16(f16 x, f16 y) {
  const f16 product = x * y;
  return product;
}
// f16 strict less-than comparison.
static inline bool cmplt16(f16 x, f16 y) {
  const bool is_less = x < y;
  return is_less;
}
// f16 less-than-or-equal comparison.
static inline bool cmple16(f16 x, f16 y) {
  const bool is_less_eq = x <= y;
  return is_less_eq;
}
static inline f16 sitofp_i8_f16(int8_t x) {
return (f16) x;
}
static inline f16 sitofp_i16_f16(int16_t x) {
return (f16) x;
}
static inline f16 sitofp_i32_f16(int32_t x) {
return (f16) x;
}
static inline f16 sitofp_i64_f16(int64_t x) {
return (f16) x;
}
static inline f16 uitofp_i8_f16(uint8_t x) {
return (f16) x;
}
static inline f16 uitofp_i16_f16(uint16_t x) {
return (f16) x;
}
static inline f16 uitofp_i32_f16(uint32_t x) {
return (f16) x;
}
static inline f16 uitofp_i64_f16(uint64_t x) {
return (f16) x;
}
// f16 to signed 8-bit integer, converting through float first.
static inline int8_t fptosi_f16_i8(f16 x) {
  const int8_t converted = (int8_t) (float) x;
  return converted;
}
// f16 to signed 16-bit integer.
static inline int16_t fptosi_f16_i16(f16 x) {
  const int16_t converted = (int16_t) x;
  return converted;
}
// f16 to signed 32-bit integer.
static inline int32_t fptosi_f16_i32(f16 x) {
  const int32_t converted = (int32_t) x;
  return converted;
}
// f16 to signed 64-bit integer.
static inline int64_t fptosi_f16_i64(f16 x) {
  const int64_t converted = (int64_t) x;
  return converted;
}
// f16 to unsigned 8-bit integer, converting through float first.
static inline uint8_t fptoui_f16_i8(f16 x) {
  const uint8_t converted = (uint8_t) (float) x;
  return converted;
}
// f16 to unsigned 16-bit integer.
static inline uint16_t fptoui_f16_i16(f16 x) {
  const uint16_t converted = (uint16_t) x;
  return converted;
}
// f16 to unsigned 32-bit integer.
static inline uint32_t fptoui_f16_i32(f16 x) {
  const uint32_t converted = (uint32_t) x;
  return converted;
}
// f16 to unsigned 64-bit integer.
static inline uint64_t fptoui_f16_i64(f16 x) {
  const uint64_t converted = (uint64_t) x;
  return converted;
}
// f16 to bool: true for any value that compares unequal to (f16)0.
static inline bool ftob_f16_bool(f16 x) {
  const bool nonzero = x != (f16)0;
  return nonzero;
}
// bool to f16: 1 for true, 0 for false.
static inline f16 btof_bool_f16(bool x) {
  const f16 value = x ? 1 : 0;
  return value;
}
#ifndef EMULATE_F16
// NaN test for native f16: widens to float and forwards to isnan.
static inline bool futrts_isnan16(f16 x) {
  const bool is_nan = isnan((float)x);
  return is_nan;
}
#ifdef __OPENCL_VERSION__
// Absolute value: forwards to fabs.
static inline f16 fabs16(f16 x) {
  const f16 magnitude = fabs(x);
  return magnitude;
}
// Maximum: forwards to fmax.
static inline f16 fmax16(f16 x, f16 y) {
  const f16 larger = fmax(x, y);
  return larger;
}
// Minimum: forwards to fmin.
static inline f16 fmin16(f16 x, f16 y) {
  const f16 smaller = fmin(x, y);
  return smaller;
}
// Power: forwards to pow.
static inline f16 fpow16(f16 x, f16 y) {
  const f16 result = pow(x, y);
  return result;
}
#elif ISPC
// Absolute value: forwards to abs.
static inline f16 fabs16(f16 x) {
  const f16 magnitude = abs(x);
  return magnitude;
}
// Maximum of x and y; if one operand is NaN the other one is returned.
static inline f16 fmax16(f16 x, f16 y) {
  if (futrts_isnan16(x)) return y;
  if (futrts_isnan16(y)) return x;
  return max(x, y);
}
// Minimum of x and y; if one operand is NaN the other one is returned.
static inline f16 fmin16(f16 x, f16 y) {
  if (futrts_isnan16(x)) return y;
  if (futrts_isnan16(y)) return x;
  return min(x, y);
}
// Power: forwards to pow.
static inline f16 fpow16(f16 x, f16 y) {
  const f16 result = pow(x, y);
  return result;
}
#else // Assuming CUDA.
// Absolute value: forwards to the single-precision fabsf.
static inline f16 fabs16(f16 x) {
  const f16 magnitude = fabsf(x);
  return magnitude;
}
// Maximum: forwards to the single-precision fmaxf.
static inline f16 fmax16(f16 x, f16 y) {
  const f16 larger = fmaxf(x, y);
  return larger;
}
// Minimum: forwards to the single-precision fminf.
static inline f16 fmin16(f16 x, f16 y) {
  const f16 smaller = fminf(x, y);
  return smaller;
}
// Power: forwards to the single-precision powf.
static inline f16 fpow16(f16 x, f16 y) {
  const f16 result = powf(x, y);
  return result;
}
#endif
#if ISPC
// True when x is an infinity: x itself is not NaN but x - x is.
static inline bool futrts_isinf16(float x) {
  const bool self_is_nan = futrts_isnan16(x);
  return !self_is_nan && futrts_isnan16(x - x);
}
// True when x is neither NaN nor an infinity.
static inline bool futrts_isfinite16(float x) {
  const bool self_is_nan = futrts_isnan16(x);
  return !self_is_nan && !futrts_isinf16(x);
}
#else
// Infinity test for native f16: widens to float and forwards to isinf.
static inline bool futrts_isinf16(f16 x) {
  const bool is_inf = isinf((float)x);
  return is_inf;
}
#endif
#ifdef __OPENCL_VERSION__
// Natural logarithm: forwards to log.
static inline f16 futrts_log16(f16 x) {
  const f16 result = log(x);
  return result;
}
// Base-2 logarithm: forwards to log2.
static inline f16 futrts_log2_16(f16 x) {
  const f16 result = log2(x);
  return result;
}
// Base-10 logarithm: forwards to log10.
static inline f16 futrts_log10_16(f16 x) {
  const f16 result = log10(x);
  return result;
}
// log(1 + x): forwards to log1p.
static inline f16 futrts_log1p_16(f16 x) {
  const f16 result = log1p(x);
  return result;
}
// Square root: forwards to sqrt.
static inline f16 futrts_sqrt16(f16 x) {
  const f16 result = sqrt(x);
  return result;
}
// Cube root: forwards to cbrt.
static inline f16 futrts_cbrt16(f16 x) {
  const f16 result = cbrt(x);
  return result;
}
// Exponential: forwards to exp.
static inline f16 futrts_exp16(f16 x) {
  const f16 result = exp(x);
  return result;
}
// Cosine: forwards to cos.
static inline f16 futrts_cos16(f16 x) {
  const f16 result = cos(x);
  return result;
}
// Sine: forwards to sin.
static inline f16 futrts_sin16(f16 x) {
  const f16 result = sin(x);
  return result;
}
// Tangent: forwards to tan.
static inline f16 futrts_tan16(f16 x) {
  const f16 result = tan(x);
  return result;
}
// Arc cosine: forwards to acos.
static inline f16 futrts_acos16(f16 x) {
  const f16 result = acos(x);
  return result;
}
// Arc sine: forwards to asin.
static inline f16 futrts_asin16(f16 x) {
  const f16 result = asin(x);
  return result;
}
// Arc tangent: forwards to atan.
static inline f16 futrts_atan16(f16 x) {
  const f16 result = atan(x);
  return result;
}
// Hyperbolic cosine: forwards to cosh.
static inline f16 futrts_cosh16(f16 x) {
  const f16 result = cosh(x);
  return result;
}
// Hyperbolic sine: forwards to sinh.
static inline f16 futrts_sinh16(f16 x) {
  const f16 result = sinh(x);
  return result;
}
// Hyperbolic tangent: forwards to tanh.
static inline f16 futrts_tanh16(f16 x) {
  const f16 result = tanh(x);
  return result;
}
// Inverse hyperbolic cosine: forwards to acosh.
static inline f16 futrts_acosh16(f16 x) {
  const f16 result = acosh(x);
  return result;
}
// Inverse hyperbolic sine: forwards to asinh.
static inline f16 futrts_asinh16(f16 x) {
  const f16 result = asinh(x);
  return result;
}
// Inverse hyperbolic tangent: forwards to atanh.
static inline f16 futrts_atanh16(f16 x) {
  const f16 result = atanh(x);
  return result;
}
// Two-argument arc tangent: forwards to atan2.
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  const f16 result = atan2(x, y);
  return result;
}
// Hypotenuse: forwards to hypot.
static inline f16 futrts_hypot16(f16 x, f16 y) {
  const f16 result = hypot(x, y);
  return result;
}
// Gamma function: forwards to tgamma.
static inline f16 futrts_gamma16(f16 x) {
  const f16 result = tgamma(x);
  return result;
}
// Log-gamma function: forwards to lgamma.
static inline f16 futrts_lgamma16(f16 x) {
  const f16 result = lgamma(x);
  return result;
}
// Error function: forwards to erf.
static inline f16 futrts_erf16(f16 x) {
  const f16 result = erf(x);
  return result;
}
// Complementary error function: forwards to erfc.
static inline f16 futrts_erfc16(f16 x) {
  const f16 result = erfc(x);
  return result;
}
// Floating-point remainder: forwards to fmod.
static inline f16 fmod16(f16 x, f16 y) {
  const f16 result = fmod(x, y);
  return result;
}
// Rounding: forwards to rint.
static inline f16 futrts_round16(f16 x) {
  const f16 result = rint(x);
  return result;
}
// Floor: forwards to floor.
static inline f16 futrts_floor16(f16 x) {
  const f16 result = floor(x);
  return result;
}
// Ceiling: forwards to ceil.
static inline f16 futrts_ceil16(f16 x) {
  const f16 result = ceil(x);
  return result;
}
// Next representable value after x in the direction of y: forwards to nextafter.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  const f16 result = nextafter(x, y);
  return result;
}
// Linear interpolation: forwards to mix.
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  const f16 result = mix(v0, v1, t);
  return result;
}
// Multiply-add: forwards to mad.
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  const f16 result = mad(a, b, c);
  return result;
}
// Fused multiply-add: forwards to fma.
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  const f16 result = fma(a, b, c);
  return result;
}
#elif ISPC
// Natural logarithm.  log is applied to finite inputs and to negative
// infinity; any other input is returned unchanged.
static inline f16 futrts_log16(f16 x) {
  const bool take_log = futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0);
  return take_log ? log(x) : x;
}
// Base-2 logarithm as futrts_log16(x) / log(2).
static inline f16 futrts_log2_16(f16 x) {
  const f16 result = futrts_log16(x) / log(2.0f16);
  return result;
}
// Base-10 logarithm as futrts_log16(x) / log(10).
static inline f16 futrts_log10_16(f16 x) {
  const f16 result = futrts_log16(x) / log(10.0f16);
  return result;
}
// log(1 + x).  For x == -1 and for positive infinity the result is
// x / 0; otherwise log(1 + x) is adjusted by the term (z - x) / y,
// where y = 1 + x and z = y - 1.
static inline f16 futrts_log1p_16(f16 x) {
  if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16;
  f16 one_plus_x = 1.0f16 + x;
  f16 recovered_x = one_plus_x - 1.0f16;
  return log(one_plus_x) - (recovered_x-x)/one_plus_x;
}
// Square root, computed in float and cast back to float16.
static inline f16 futrts_sqrt16(f16 x) {
  const f16 result = (float16)sqrt((float)x);
  return result;
}
// Exponential: forwards to exp.
static inline f16 futrts_exp16(f16 x) {
  const f16 result = exp(x);
  return result;
}
// Cosine, computed in float and cast back to float16.
static inline f16 futrts_cos16(f16 x) {
  const f16 result = (float16)cos((float)x);
  return result;
}
// Sine, computed in float and cast back to float16.
static inline f16 futrts_sin16(f16 x) {
  const f16 result = (float16)sin((float)x);
  return result;
}
// Tangent, computed in float and cast back to float16.
static inline f16 futrts_tan16(f16 x) {
  const f16 result = (float16)tan((float)x);
  return result;
}
// Arc cosine, computed in float and cast back to float16.
static inline f16 futrts_acos16(f16 x) {
  const f16 result = (float16)acos((float)x);
  return result;
}
// Arc sine, computed in float and cast back to float16.
static inline f16 futrts_asin16(f16 x) {
  const f16 result = (float16)asin((float)x);
  return result;
}
// Arc tangent, computed in float and cast back to float16.
static inline f16 futrts_atan16(f16 x) {
  const f16 result = (float16)atan((float)x);
  return result;
}
// Hyperbolic cosine computed as (exp(x) + exp(-x)) / 2.
static inline f16 futrts_cosh16(f16 x) {
  const f16 result = (exp(x)+exp(-x)) / 2.0f16;
  return result;
}
// Hyperbolic sine computed as (exp(x) - exp(-x)) / 2.
static inline f16 futrts_sinh16(f16 x) {
  const f16 result = (exp(x)-exp(-x)) / 2.0f16;
  return result;
}
// Hyperbolic tangent computed as the quotient sinh / cosh.
static inline f16 futrts_tanh16(f16 x) {
  const f16 result = futrts_sinh16(x)/futrts_cosh16(x);
  return result;
}
// Inverse hyperbolic cosine via log(x + sqrt(x*x - 1)), with the square
// root taken in float.  A non-finite log argument is returned as-is.
static inline f16 futrts_acosh16(f16 x) {
  float16 arg = x+(float16)sqrt((float)(x*x-1));
  if (!futrts_isfinite16(arg)) return arg;
  return log(arg);
}
// Inverse hyperbolic sine via log(x + sqrt(x*x + 1)), with the square
// root taken in float.  A non-finite log argument is returned as-is.
static inline f16 futrts_asinh16(f16 x) {
  float16 arg = x+(float16)sqrt((float)(x*x+1));
  if (!futrts_isfinite16(arg)) return arg;
  return log(arg);
}
// Inverse hyperbolic tangent via log((1 + x) / (1 - x)) / 2.  A
// non-finite quotient is returned as-is.
static inline f16 futrts_atanh16(f16 x) {
  float16 ratio = (1+x)/(1-x);
  if (!futrts_isfinite16(ratio)) return ratio;
  return log(ratio)/2.0f16;
}
// Two-argument arc tangent, computed in float and cast back to float16.
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  const f16 result = (float16)atan2((float)x, (float)y);
  return result;
}
// Hypotenuse: delegates to the f32 version and casts back to float16.
static inline f16 futrts_hypot16(f16 x, f16 y) {
  const f16 result = (float16)futrts_hypot32((float)x, (float)y);
  return result;
}
extern "C" unmasked uniform float tgammaf(uniform float x);
// Gamma function: widens to float, calls the external tgammaf once per
// active lane, and casts each result to f16.
static inline f16 futrts_gamma16(f16 x) {
  f16 out;
  foreach_active (lane) {
    uniform f16 lane_val = (f16)tgammaf(extract((float)x, lane));
    out = insert(out, lane, lane_val);
  }
  return out;
}
extern "C" unmasked uniform float lgammaf(uniform float x);
// Log-gamma function: widens to float, calls the external lgammaf once
// per active lane, and casts each result to f16.
static inline f16 futrts_lgamma16(f16 x) {
  f16 out;
  foreach_active (lane) {
    uniform f16 lane_val = (f16)lgammaf(extract((float)x, lane));
    out = insert(out, lane, lane_val);
  }
  return out;
}
// Cube root: delegates to the f32 version and casts the result to f16.
static inline f16 futrts_cbrt16(f16 x) {
  return (f16)futrts_cbrt32((float)x);
}
// Error function: delegates to the f32 version and casts the result to f16.
static inline f16 futrts_erf16(f16 x) {
  return (f16)futrts_erf32((float)x);
}
// Complementary error function: delegates to the f32 version and casts
// the result to f16.
static inline f16 futrts_erfc16(f16 x) {
  return (f16)futrts_erfc32((float)x);
}
// Remainder of x / y computed as x - y * trunc(x / y), with the
// truncation done in float.
static inline f16 fmod16(f16 x, f16 y) {
  const f16 result = x - y * (float16)trunc((float) (x/y));
  return result;
}
// Rounding, computed in float and cast back to float16.
static inline f16 futrts_round16(f16 x) {
  const f16 result = (float16)round((float)x);
  return result;
}
// Floor, computed in float and cast back to float16.
static inline f16 futrts_floor16(f16 x) {
  const f16 result = (float16)floor((float)x);
  return result;
}
// Ceiling, computed in float and cast back to float16.
static inline f16 futrts_ceil16(f16 x) {
  const f16 result = (float16)ceil((float)x);
  return result;
}
// nextafter: delegates to the f32 version and casts back to float16.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  const f16 result = (float16)futrts_nextafter32((float)x, (float) y);
  return result;
}
// Linear interpolation: v0 + (v1 - v0) * t.
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  const f16 result = v0 + (v1 - v0) * t;
  return result;
}
// Multiply-add written as a plain a * b + c.
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  const f16 result = a * b + c;
  return result;
}
// Here "fma" is also the plain expression a * b + c.
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  const f16 result = a * b + c;
  return result;
}
#else // Assume CUDA.
// Natural logarithm: forwards to hlog.
static inline f16 futrts_log16(f16 x) {
  const f16 result = hlog(x);
  return result;
}
// Base-2 logarithm: forwards to hlog2.
static inline f16 futrts_log2_16(f16 x) {
  const f16 result = hlog2(x);
  return result;
}
// Base-10 logarithm: forwards to hlog10.
static inline f16 futrts_log10_16(f16 x) {
  const f16 result = hlog10(x);
  return result;
}
// log(1 + x), computed in float via log1pf and cast back to f16.
static inline f16 futrts_log1p_16(f16 x) {
  const f16 result = (f16)log1pf((float)x);
  return result;
}
// Square root: forwards to hsqrt.
static inline f16 futrts_sqrt16(f16 x) {
  const f16 result = hsqrt(x);
  return result;
}
// Cube root: forwards to the single-precision cbrtf.
static inline f16 futrts_cbrt16(f16 x) {
  const f16 result = cbrtf(x);
  return result;
}
// Exponential: forwards to hexp.
static inline f16 futrts_exp16(f16 x) {
  const f16 result = hexp(x);
  return result;
}
// Cosine: forwards to hcos.
static inline f16 futrts_cos16(f16 x) {
  const f16 result = hcos(x);
  return result;
}
// Sine: forwards to hsin.
static inline f16 futrts_sin16(f16 x) {
  const f16 result = hsin(x);
  return result;
}
// Tangent: forwards to the single-precision tanf.
static inline f16 futrts_tan16(f16 x) {
  const f16 result = tanf(x);
  return result;
}
// Arc cosine: forwards to the single-precision acosf.
static inline f16 futrts_acos16(f16 x) {
  const f16 result = acosf(x);
  return result;
}
// Arc sine: forwards to the single-precision asinf.
static inline f16 futrts_asin16(f16 x) {
  const f16 result = asinf(x);
  return result;
}
// Arc tangent: forwards to the single-precision atanf.
static inline f16 futrts_atan16(f16 x) {
  const f16 result = atanf(x);
  return result;
}
// Hyperbolic cosine: forwards to the single-precision coshf.
static inline f16 futrts_cosh16(f16 x) {
  const f16 result = coshf(x);
  return result;
}
// Hyperbolic sine: forwards to the single-precision sinhf.
static inline f16 futrts_sinh16(f16 x) {
  const f16 result = sinhf(x);
  return result;
}
// Hyperbolic tangent: forwards to the single-precision tanhf.
static inline f16 futrts_tanh16(f16 x) {
  const f16 result = tanhf(x);
  return result;
}
// Inverse hyperbolic cosine: forwards to the single-precision acoshf.
static inline f16 futrts_acosh16(f16 x) {
  const f16 result = acoshf(x);
  return result;
}
// Inverse hyperbolic sine: forwards to the single-precision asinhf.
static inline f16 futrts_asinh16(f16 x) {
  const f16 result = asinhf(x);
  return result;
}
// Inverse hyperbolic tangent: forwards to the single-precision atanhf.
static inline f16 futrts_atanh16(f16 x) {
  const f16 result = atanhf(x);
  return result;
}
// Two-argument arc tangent: forwards to the single-precision atan2f.
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  const f16 result = atan2f(x, y);
  return result;
}
// Hypotenuse: forwards to the single-precision hypotf.
static inline f16 futrts_hypot16(f16 x, f16 y) {
  const f16 result = hypotf(x, y);
  return result;
}
// Gamma function: forwards to the single-precision tgammaf.
static inline f16 futrts_gamma16(f16 x) {
  const f16 result = tgammaf(x);
  return result;
}
// Log-gamma function: forwards to the single-precision lgammaf.
static inline f16 futrts_lgamma16(f16 x) {
  const f16 result = lgammaf(x);
  return result;
}
// Error function: forwards to the single-precision erff.
static inline f16 futrts_erf16(f16 x) {
  const f16 result = erff(x);
  return result;
}
// Complementary error function: forwards to the single-precision erfcf.
static inline f16 futrts_erfc16(f16 x) {
  const f16 result = erfcf(x);
  return result;
}
// Floating-point remainder: forwards to the single-precision fmodf.
static inline f16 fmod16(f16 x, f16 y) {
  const f16 result = fmodf(x, y);
  return result;
}
// Rounding: forwards to the single-precision rintf.
static inline f16 futrts_round16(f16 x) {
  const f16 result = rintf(x);
  return result;
}
// Floor: forwards to hfloor.
static inline f16 futrts_floor16(f16 x) {
  const f16 result = hfloor(x);
  return result;
}
// Ceiling: forwards to hceil.
static inline f16 futrts_ceil16(f16 x) {
  const f16 result = hceil(x);
  return result;
}
// nextafter on the 16-bit patterns: both operands are converted with
// __half_as_ushort, stepped by halfbitsnextafter, and the resulting
// pattern is turned back into a half.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return __ushort_as_half(
      halfbitsnextafter(__half_as_ushort(x),
                        __half_as_ushort(y)));
}
// Linear interpolation: v0 + (v1 - v0) * t.
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  const f16 result = v0 + (v1 - v0) * t;
  return result;
}
// Multiply-add written as a plain a * b + c.
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  const f16 result = a * b + c;
  return result;
}
// Fused multiply-add: forwards to the single-precision fmaf.
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  const f16 result = fmaf(a, b, c);
  return result;
}
#endif
// The CUDA __half type cannot be put in unions for some reason, so we
// use bespoke conversion functions instead.
#ifdef __CUDA_ARCH__
// Reinterpret a half as its 16-bit pattern via __half_as_ushort.
static inline int16_t futrts_to_bits16(f16 x) {
  const int16_t bits = __half_as_ushort(x);
  return bits;
}
// Reinterpret a 16-bit pattern as a half via __ushort_as_half.
static inline f16 futrts_from_bits16(int16_t x) {
  const f16 value = __ushort_as_half(x);
  return value;
}
#elif ISPC
// Reinterpret an f16 as its 16-bit pattern by reading it through an
// int16_t pointer.
static inline int16_t futrts_to_bits16(f16 x) {
  varying int16_t bits = *((varying int16_t * uniform)&x);
  return bits;
}
// Reinterpret a 16-bit pattern as an f16 by reading it through an f16
// pointer.
static inline f16 futrts_from_bits16(int16_t x) {
  varying f16 value = *((varying f16 * uniform)&x);
  return value;
}
#else
// Reinterpret an f16 as its 16-bit pattern by writing one union member
// and reading the other.
static inline int16_t futrts_to_bits16(f16 x) {
  union {
    f16 as_half;
    int16_t as_bits;
  } pun;
  pun.as_half = x;
  return pun.as_bits;
}
// Reinterpret a 16-bit pattern as an f16 by writing one union member
// and reading the other.
static inline f16 futrts_from_bits16(int16_t x) {
  union {
    int16_t as_bits;
    f16 as_half;
  } pun;
  pun.as_bits = x;
  return pun.as_half;
}
#endif
#else // No native f16 - emulate.
// Emulated f16 absolute value: delegates to fabs32.
static inline f16 fabs16(f16 x) {
  const f16 result = fabs32(x);
  return result;
}
// Emulated f16 maximum: delegates to fmax32.
static inline f16 fmax16(f16 x, f16 y) {
  const f16 result = fmax32(x, y);
  return result;
}
// Emulated f16 minimum: delegates to fmin32.
static inline f16 fmin16(f16 x, f16 y) {
  const f16 result = fmin32(x, y);
  return result;
}
// Emulated f16 power: delegates to fpow32.
static inline f16 fpow16(f16 x, f16 y) {
  const f16 result = fpow32(x, y);
  return result;
}
// Emulated f16 NaN test: delegates to futrts_isnan32.
static inline bool futrts_isnan16(f16 x) {
  const bool is_nan = futrts_isnan32(x);
  return is_nan;
}
// Emulated f16 infinity test: delegates to futrts_isinf32.
static inline bool futrts_isinf16(f16 x) {
  const bool is_inf = futrts_isinf32(x);
  return is_inf;
}
// Emulated f16 natural logarithm: delegates to futrts_log32.
static inline f16 futrts_log16(f16 x) {
  const f16 result = futrts_log32(x);
  return result;
}
// Emulated f16 base-2 logarithm: delegates to futrts_log2_32.
static inline f16 futrts_log2_16(f16 x) {
  const f16 result = futrts_log2_32(x);
  return result;
}
// Emulated f16 base-10 logarithm: delegates to futrts_log10_32.
static inline f16 futrts_log10_16(f16 x) {
  const f16 result = futrts_log10_32(x);
  return result;
}
// Emulated f16 log(1 + x): delegates to futrts_log1p_32.
static inline f16 futrts_log1p_16(f16 x) {
  const f16 result = futrts_log1p_32(x);
  return result;
}
// Emulated f16 square root: delegates to futrts_sqrt32.
static inline f16 futrts_sqrt16(f16 x) {
  const f16 result = futrts_sqrt32(x);
  return result;
}
// Emulated f16 cube root: delegates to futrts_cbrt32.
static inline f16 futrts_cbrt16(f16 x) {
  const f16 result = futrts_cbrt32(x);
  return result;
}
// Emulated f16 exponential: delegates to futrts_exp32.
static inline f16 futrts_exp16(f16 x) {
  const f16 result = futrts_exp32(x);
  return result;
}
// Emulated f16 cosine: delegates to futrts_cos32.
static inline f16 futrts_cos16(f16 x) {
  const f16 result = futrts_cos32(x);
  return result;
}
// Emulated f16 sine: delegates to futrts_sin32.
static inline f16 futrts_sin16(f16 x) {
  const f16 result = futrts_sin32(x);
  return result;
}
// Emulated f16 tangent: delegates to futrts_tan32.
static inline f16 futrts_tan16(f16 x) {
  const f16 result = futrts_tan32(x);
  return result;
}
// Emulated f16 arc cosine: delegates to futrts_acos32.
static inline f16 futrts_acos16(f16 x) {
  const f16 result = futrts_acos32(x);
  return result;
}
// Emulated f16 arc sine: delegates to futrts_asin32.
static inline f16 futrts_asin16(f16 x) {
  const f16 result = futrts_asin32(x);
  return result;
}
// Emulated f16 arc tangent: delegates to futrts_atan32.
static inline f16 futrts_atan16(f16 x) {
  const f16 result = futrts_atan32(x);
  return result;
}
// Emulated f16 hyperbolic cosine: delegates to futrts_cosh32.
static inline f16 futrts_cosh16(f16 x) {
  const f16 result = futrts_cosh32(x);
  return result;
}
// Emulated f16 hyperbolic sine: delegates to futrts_sinh32.
static inline f16 futrts_sinh16(f16 x) {
  const f16 result = futrts_sinh32(x);
  return result;
}
// Emulated f16 hyperbolic tangent: delegates to futrts_tanh32.
static inline f16 futrts_tanh16(f16 x) {
  const f16 result = futrts_tanh32(x);
  return result;
}
// Emulated f16 inverse hyperbolic cosine: delegates to futrts_acosh32.
static inline f16 futrts_acosh16(f16 x) {
  const f16 result = futrts_acosh32(x);
  return result;
}
// Emulated f16 inverse hyperbolic sine: delegates to futrts_asinh32.
static inline f16 futrts_asinh16(f16 x) {
  const f16 result = futrts_asinh32(x);
  return result;
}
// Emulated f16 inverse hyperbolic tangent: delegates to futrts_atanh32.
static inline f16 futrts_atanh16(f16 x) {
  const f16 result = futrts_atanh32(x);
  return result;
}
// Emulated f16 two-argument arc tangent: delegates to futrts_atan2_32.
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  const f16 result = futrts_atan2_32(x, y);
  return result;
}
// Emulated f16 hypotenuse: delegates to futrts_hypot32.
static inline f16 futrts_hypot16(f16 x, f16 y) {
  const f16 result = futrts_hypot32(x, y);
  return result;
}
// Emulated f16 gamma function: delegates to futrts_gamma32.
static inline f16 futrts_gamma16(f16 x) {
  const f16 result = futrts_gamma32(x);
  return result;
}
// Emulated f16 log-gamma function: delegates to futrts_lgamma32.
static inline f16 futrts_lgamma16(f16 x) {
  const f16 result = futrts_lgamma32(x);
  return result;
}
// Emulated f16 error function: delegates to futrts_erf32.
static inline f16 futrts_erf16(f16 x) {
  const f16 result = futrts_erf32(x);
  return result;
}
// Emulated f16 complementary error function: delegates to futrts_erfc32.
static inline f16 futrts_erfc16(f16 x) {
  const f16 result = futrts_erfc32(x);
  return result;
}
static inline f16 fmod16(f16 x, f16 y) {
return fmod32(x, y);
}
// f16 round: forwards x to futrts_round32 and returns the result as an
// f16.
static inline f16 futrts_round16(f16 x) {
  const f16 rounded = futrts_round32(x);
  return rounded;
}
// f16 floor: forwards x to futrts_floor32 and returns the result as an
// f16.
static inline f16 futrts_floor16(f16 x) {
  const f16 floored = futrts_floor32(x);
  return floored;
}
// f16 ceil: forwards x to futrts_ceil32 and returns the result as an
// f16.
static inline f16 futrts_ceil16(f16 x) {
  const f16 ceiled = futrts_ceil32(x);
  return ceiled;
}
// f16 nextafter: passes both operands through float2halfbits, applies
// halfbitsnextafter to the two bit patterns, and converts the outcome
// back with halfbits2float.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  const uint16_t from_bits = float2halfbits(x);
  const uint16_t toward_bits = float2halfbits(y);
  const uint16_t stepped_bits = halfbitsnextafter(from_bits, toward_bits);
  return halfbits2float(stepped_bits);
}
// f16 lerp: forwards (v0, v1, t) to futrts_lerp32 and returns the
// result as an f16.
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  const f16 blended = futrts_lerp32(v0, v1, t);
  return blended;
}
// f16 mad: forwards (a, b, c) to futrts_mad32 and returns the result
// as an f16.
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  const f16 result = futrts_mad32(a, b, c);
  return result;
}
// f16 fma: forwards (a, b, c) to futrts_fma32 and returns the result
// as an f16.
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  const f16 result = futrts_fma32(a, b, c);
  return result;
}
// Even when we are using an OpenCL that does not support cl_khr_fp16,
// it must still support vload_half for actually creating a
// half-precision number, which can then be efficiently converted to a
// float. Similarly for vstore_half.
#ifdef __OPENCL_VERSION__
// OpenCL variant: returns the 16 bits that vstore_half writes for x.
static inline int16_t futrts_to_bits16(f16 x) {
  int16_t bits;
  const float widened = (float)x;
  // The int16_t is written through a half pointer, which violates
  // strict aliasing.
  vstore_half(widened, 0, (half*)&bits);
  return bits;
}
static inline f16 futrts_from_bits16(int16_t x) {
return (f16)vload_half(0, (half*)&x);
}
#else
// Non-OpenCL variant: returns the output of float2halfbits for x, cast
// to int16_t.
static inline int16_t futrts_to_bits16(f16 x) {
  const int16_t bits = (int16_t)float2halfbits(x);
  return bits;
}
// Non-OpenCL variant: casts x to uint16_t and converts it with
// halfbits2float.
static inline f16 futrts_from_bits16(int16_t x) {
  const uint16_t raw = (uint16_t)x;
  return halfbits2float(raw);
}
// Sign of x: 1 when x > 0, -1 when x < 0, 0 when neither holds. A NaN
// input (as judged by futrts_isnan16) is returned unchanged.
static inline f16 fsignum16(f16 x) {
  if (futrts_isnan16(x)) {
    return x;
  }
  const int above_zero = x > 0 ? 1 : 0;
  const int below_zero = x < 0 ? 1 : 0;
  return above_zero - below_zero;
}
#endif
#endif
// f16 -> f16 conversion: passes the value through. Note that the
// declared return type is float, not f16.
static inline float fpconv_f16_f16(f16 x) {
  const float unchanged = x;
  return unchanged;
}
// f16 -> f32 conversion, relying on the implicit conversion to float.
static inline float fpconv_f16_f32(f16 x) {
  const float as_f32 = x;
  return as_f32;
}
static inline f16 fpconv_f32_f16(float x) {
return (f16) x;
}
#ifdef FUTHARK_F64_ENABLED
// f16 -> f64 conversion via an explicit cast to double.
static inline double fpconv_f16_f64(f16 x) {
  const double as_f64 = (double) x;
  return as_f64;
}
#if ISPC
static inline f16 fpconv_f64_f16(double x) {
return (f16) ((float)x);
}
#else
static inline f16 fpconv_f64_f16(double x) {
return (f16) x;
}
#endif
#endif
// End of scalar_f16.h.
// Start of context_prototypes.h
//
// Prototypes for the functions in context.h, or that will be called
// from those functions, that need to be available very early.
// Forward declarations so the prototypes below can take pointers to
// these types before their full definitions appear.
struct futhark_context_config;
struct futhark_context;
// Takes the context and an error string.
// NOTE(review): ownership of 'error' is not visible from this
// prototype; confirm against the definition.
static void set_error(struct futhark_context* ctx, char *error);
// These are called in context/config new/free functions and contain
// shared setup. They are generated by the compiler itself.
static int init_constants(struct futhark_context*);
static int free_constants(struct futhark_context*);
static void setup_program(struct futhark_context* ctx);
static void teardown_program(struct futhark_context *ctx);
// Allocate host memory. Must be freed with host_free().
// Results are delivered through the size_out and mem_out pointers.
static void host_alloc(struct futhark_context* ctx, size_t size, const char* tag, size_t* size_out, void** mem_out);
// Free memory allocated with host_alloc().
static void host_free(struct futhark_context* ctx, size_t size, const char* tag, void* mem);
// Functions that must be defined by the backend.
static void backend_context_config_setup(struct futhark_context_config* cfg);
static void backend_context_config_teardown(struct futhark_context_config* cfg);
static int backend_context_setup(struct futhark_context *ctx);
static void backend_context_teardown(struct futhark_context *ctx);
// End of context_prototypes.h
// A memory block whose storage is an OpenCL memory object (cl_mem).
// Has the same fields as struct memblock, except for the type of 'mem'.
struct memblock_device {
// NOTE(review): presumably a shared reference counter; confirm
// against the code that allocates and releases blocks.
int *references;
// The OpenCL memory object backing this block.
cl_mem mem;
// NOTE(review): presumably the size of the block in bytes; confirm.
int64_t size;
// NOTE(review): presumably a descriptive label for diagnostics; confirm.
const char *desc;
};
// A memory block whose storage is addressed through a plain
// unsigned char pointer. Has the same fields as struct
// memblock_device, except for the type of 'mem'.
struct memblock {
// NOTE(review): presumably a shared reference counter; confirm
// against the code that allocates and releases blocks.
int *references;
// Pointer to the block's bytes.
unsigned char *mem;
// NOTE(review): presumably the size of the block in bytes; confirm.
int64_t size;
// NOTE(review): presumably a descriptive label for diagnostics; confirm.
const char *desc;
};
// Generated struct with a single int member.
// NOTE(review): 'dummy' looks like a placeholder that keeps the struct
// non-empty (standard C requires at least one member); confirm that
// nothing reads it.
struct constants {
int dummy;
};
// One int64_t pointer per tuning parameter. The member names match the
// entries of tuning_param_vars.
// NOTE(review): what each pointer refers to is not visible here;
// confirm where they are initialised.
struct tuning_params {
int64_t *addzisegmap_group_sizze_6879;
int64_t *add_i64zisegmap_group_sizze_6899;
};
// Number of tuning parameters described by the tables below.
static const int num_tuning_params = 2;
// Parallel tables, one entry per tuning parameter in the same order.
// Each string table carries a trailing NULL after its
// num_tuning_params entries.
// Names of the tuning parameters.
static const char *tuning_param_names[] = {"add.segmap_group_size_6879", "add_i64.segmap_group_size_6899", NULL};
// Identifier form of each name; these match the member names of
// struct tuning_params.
static const char *tuning_param_vars[] = {"addzisegmap_group_sizze_6879", "add_i64zisegmap_group_sizze_6899", NULL};
// Class of each parameter; both are "group_size".
static const char *tuning_param_classes[] = {"group_size", "group_size", NULL};
// Default values, all zero. The array has three entries, the same
// length as the string tables including their NULL slot.
// NOTE(review): whether 0 means "pick a value at runtime" is not
// visible here; confirm.
static int64_t tuning_param_defaults[] = {0, 0, 0};
// NOTE(review): presumably the largest number of arguments any failure
// message takes (0 here); confirm.
static const int max_failure_args = 0;
// NOTE(review): presumably non-zero when the program needs
// double-precision support (0 here); confirm.
static const int f64_required = 0;
static const char *opencl_program[] = {"\n// Clang-based OpenCL implementations need this for 'static' to work.\n#ifdef cl_clang_storage_class_specifiers\n#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable\n#endif\n#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n\n// Some OpenCL programs dislike empty progams, or programs with no kernels.\n// Declare a dummy kernel to ensure they remain our friends.\n__kernel void dummy_kernel(__global unsigned char *dummy, int n)\n{\n const int thread_gid = get_global_id(0);\n if (thread_gid >= n) return;\n}\n\n#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n\ntypedef char int8_t;\ntypedef short int16_t;\ntypedef int int32_t;\ntypedef long int64_t;\n\ntypedef uchar uint8_t;\ntypedef ushort uint16_t;\ntypedef uint uint32_t;\ntypedef ulong uint64_t;\n\n// NVIDIAs OpenCL does not create device-wide memory fences (see #734), so we\n// use inline assembly if we detect we are on an NVIDIA GPU.\n#ifdef cl_nv_pragma_unroll\nstatic inline void mem_fence_global() {\n asm(\"membar.gl;\");\n}\n#else\nstatic inline void mem_fence_global() {\n mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n}\n#endif\nstatic inline void mem_fence_local() {\n mem_fence(CLK_LOCAL_MEM_FENCE);\n}\n// Start of half.h.\n\n// Conversion functions are from http://half.sourceforge.net/, but\n// translated to C.\n//\n// Copyright (c) 2012-2021 Christian Rau\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to deal\n// in the Software without restriction, including without limitation the rights\n// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n// copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and 
this permission notice shall be included in\n// all copies or substantial portions of the Softwa", "re.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n// THE SOFTWARE.\n\n#ifndef __OPENCL_VERSION__\n#define __constant\n#endif\n\n__constant static const uint16_t base_table[512] = {\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,\n 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,\n 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C0", "0, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,\n 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,\n 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC0",
"0, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 };\n\n__constant static const unsigned char shift_table[512] = {\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,\n 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 
24, 2", "4, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,\n 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };\n\n__constant static const uint32_t mantissa_table[2048] = {\n 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,\n 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,\n 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,\n 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,\n 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,\n 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 
0x36A80000, 0x36AA00", "00, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,\n 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,\n 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,\n 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,\n 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,\n 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,\n 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,\n 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,\n 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,\n 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,\n 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 
0x37780000, 0x37790000, 0x377A00",
"00, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,\n 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,\n 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,\n 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,\n 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,\n 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,\n 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,\n 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,\n 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,\n 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,\n 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 
0x37CF80", "00,\n 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,\n 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,\n 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,\n 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,\n 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,\n 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,\n 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,\n 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,\n 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,\n 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,\n 0x38100000, 0x38104000, 0x38108000, 
0x3810C000, 0x3811", "0000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,\n 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,\n 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,\n 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,\n 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,\n 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,\n 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,\n 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,\n 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,\n 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,\n 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 
0x3839C000, 0x383A0000, 0x383A",
"4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,\n 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,\n 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,\n 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,\n 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,\n 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,\n 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,\n 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,\n 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,\n 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,\n 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 
0x3863", "8000, 0x3863C000,\n 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,\n 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,\n 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,\n 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,\n 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,\n 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,\n 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,\n 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,\n 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,\n 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,\n 0x38060000, 0x38062000, 
0x38064000, 0x38", "066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,\n 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,\n 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,\n 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,\n 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,\n 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,\n 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,\n 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,\n 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,\n 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,\n 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 
0x381AC000, 0x381AE000, 0x38",
"1B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,\n 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,\n 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,\n 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,\n 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,\n 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,\n 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,\n 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,\n 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,\n 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,\n 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 
0x38", "2FA000, 0x382FC000, 0x382FE000,\n 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,\n 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,\n 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,\n 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,\n 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,\n 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,\n 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,\n 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,\n 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,\n 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,\n 0x38440000, 
0x38442000, 0x", "38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,\n 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,\n 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,\n 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,\n 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,\n 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,\n 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,\n 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,\n 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,\n 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,\n 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 
0x3858A000, 0x3858C000, 0x",
"3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,\n 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,\n 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,\n 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,\n 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,\n 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,\n 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,\n 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,\n 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,\n 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,\n 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 
0x", "386D8000, 0x386DA000, 0x386DC000, 0x386DE000,\n 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,\n 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,\n 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,\n 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,\n 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,\n 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,\n 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,\n 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,\n 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };\n__constant static const uint32_t exponent_table[64] = {\n 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 
0x05800000, 0x06", "000000, 0x06800000, 0x07000000, 0x07800000,\n 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,\n 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,\n 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };\n__constant static const unsigned short offset_table[64] = {\n 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,\n 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };\n\nstatic uint16_t float2halfbits(float value) {\n union { float x; uint32_t y; } u;\n u.x = value;\n uint32_t bits = u.y;\n\n uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);;\n\n return hbits;\n}\n\nstatic float halfbits2float(uint16_t value) {\n uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];\n\n union { uint32_t x; float y; } u;\n u.x = bits;\n return u.y;\n}\n\nstatic uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {\n int fabs = from & 0x7FFF, tabs = to & 0x7FFF;\n if(fabs > 0x7C00 || tabs > 0x7C00) {\n return ((from&0x7FFF)>0x7C00) ? 
(from|0x200) : (to|0x200);\n }\n if(from == to || !(fabs|tabs)) {\n return to;\n }\n if(!fabs) {\n return (to&0x8000)+1;\n }\n unsigned int out =\n from +\n (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1)\n - 1;\n return out;",
"\n}\n\n// End of half.h.\n// Start of scalar.h.\n\n// Implementation of the primitive scalar operations. Very\n// repetitive. This code is inserted directly into both CUDA and\n// OpenCL programs, as well as the CPU code, so it has some #ifdefs to\n// work everywhere. Some operations are defined as macros because\n// this allows us to use them as constant expressions in things like\n// array sizes and static initialisers.\n\n// Some of the #ifdefs are because OpenCL uses type-generic functions\n// for some operations (e.g. sqrt), while C and CUDA sensibly use\n// distinct functions for different precisions (e.g. sqrtf() and\n// sqrt()). This is quite annoying. Due to C's unfortunate casting\n// rules, it is also really easy to accidentally implement\n// floating-point functions in the wrong precision, so be careful.\n\n// Double-precision definitions are only included if the preprocessor\n// macro FUTHARK_F64_ENABLED is set.\n\nstatic inline uint8_t add8(uint8_t x, uint8_t y) {\n return x + y;\n}\n\nstatic inline uint16_t add16(uint16_t x, uint16_t y) {\n return x + y;\n}\n\nstatic inline uint32_t add32(uint32_t x, uint32_t y) {\n return x + y;\n}\n\nstatic inline uint64_t add64(uint64_t x, uint64_t y) {\n return x + y;\n}\n\nstatic inline uint8_t sub8(uint8_t x, uint8_t y) {\n return x - y;\n}\n\nstatic inline uint16_t sub16(uint16_t x, uint16_t y) {\n return x - y;\n}\n\nstatic inline uint32_t sub32(uint32_t x, uint32_t y) {\n return x - y;\n}\n\nstatic inline uint64_t sub64(uint64_t x, uint64_t y) {\n return x - y;\n}\n\nstatic inline uint8_t mul8(uint8_t x, uint8_t y) {\n return x * y;\n}\n\nstatic inline uint16_t mul16(uint16_t x, uint16_t y) {\n return x * y;\n}\n\nstatic inline uint32_t mul32(uint32_t x, uint32_t y) {\n return x * y;\n}\n\nstatic inline uint64_t mul64(uint64_t x, uint64_t y) {\n return x * y;\n}\n\n#if ISPC\n\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y) {\n // This strange pattern is used to prevent the ISPC compiler from\n // causing 
SIGFPEs and bogus results on divisions where inactive lan", "es\n // have 0-valued divisors. It ensures that any inactive lane instead\n // has a divisor of 1. https://github.com/ispc/ispc/issues/2292\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n\n return x / ys;\n}\n\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline uint32_t udiv32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return x / ys;\n}\n\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return x / ys;\n}\n\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return (x + y - 1) / ys;\n}\n\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return (x + y - 1) / ys;\n}\n\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return (x + y - 1) / ys;\n}\n\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return (x + y - 1) / ys;\n}\n\nstatic inline uint8_t umod8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline uint16_t umod16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return x % ys;\n}\n\nstatic inline uint32_t umod32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline uint64_t umod64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 
0 : x / ys;\n}\n\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ", "ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int8_t sdiv8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int8_t q = x / ys;\n int8_t r = x % ys;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0);\n}\n\nstatic inline int16_t sdiv16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int16_t q = x / ys;\n int16_t r = x % ys;\n\n return q - ((r != 0",
" && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int32_t sdiv32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n int32_t q = x / ys;\n int32_t r = x % ys;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int64_t sdiv64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int64_t q = x / ys;\n int64_t r = x % ys;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y) {\n return sdiv8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y) {\n return sdiv16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y) {\n return sdiv32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up64(int64_t x, int64_t y) {\n return sdiv64(x + y - 1, y);\n}\n\nstatic inline int8_t smod8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int8_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int16_t smod16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int16_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int32_t smod32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int32_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int64_t smod64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int64_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : sdiv8(x, y);\n}\n\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : sdiv16(x, y);\n}\n\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y) {\n return y == 0 ? 
0 : sdiv32(x, y);\n}\n\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : sdi", "v64(x, y);\n}\n\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {\n return sdiv_safe8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {\n return sdiv_safe16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {\n return sdiv_safe32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {\n return sdiv_safe64(x + y - 1, y);\n}\n\nstatic inline int8_t smod_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : smod8(x, y);\n}\n\nstatic inline int16_t smod_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : smod16(x, y);\n}\n\nstatic inline int32_t smod_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : smod32(x, y);\n}\n\nstatic inline int64_t smod_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : smod64(x, y);\n}\n\nstatic inline int8_t squot8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int16_t squot16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int32_t squot32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int64_t squot64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int8_t srem8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int16_t srem16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int32_t srem32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int64_t srem64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int8_t 
squot_safe8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int16_t ", "squot_safe16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int32_t squot_safe32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int64_t squot_safe64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int8_t srem_safe8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int16_t srem_safe16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int32_t srem_safe32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int64_t srem_safe64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 
0 : x % ys;\n}\n\n#else\n\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y) {\n return x / y;\n}\n\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y) {\n return x / y;\n}\n\nstatic inline uint32_t udiv32(uint32_t x, uint32_t y) {\n return x / y;\n}\n\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y) {\n return x / y;\n}\n\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint8_t umod8(uint8_t x, uint8_t y) {\n return x % y;\n}\n\nstatic inline uint16_t umod16(uint16_t x, uint16_t y) {\n return x % y;\n}\n\nstatic inline uint32_t umod32(uint32_t x, uint32_t y) {\n return x % y;\n}\n\nstatic inline uint64_t umod64(uint64_t x, uint64_t y) {\n return x % y;\n}\n\nstatic inline uint8_t udiv_safe8(u",
"int8_t x, uint8_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int8_t sdiv8(int8_t x, int8_t y) {\n int8_t q = x / y;\n int8_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int16_t sdiv16(int16_t x, int16_t y) {\n int16_t q = x / y;\n int16_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int32_t sdiv32(int32_t x, int32_t y) {\n int32_t q = x / y;\n int32_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int64_t sdiv64(int64_t x, int64_t y) {\n int64_t q = x / y;\n int64_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0);\n}\n\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y) {\n return sdiv8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y) {\n return sdiv16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y) {\n return sdiv32(x + y - 1, y);\n}\n\nstatic inline int64", "_t sdiv_up64(int64_t x, int64_t y) {\n return sdiv64(x + y - 1, y);\n}\n\nstatic inline int8_t smod8(int8_t x, int8_t y) {\n int8_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int16_t smod16(int16_t x, int16_t y) {\n int16_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int32_t smod32(int32_t x, int32_t y) {\n int32_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int64_t smod64(int64_t x, int64_t y) {\n int64_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : sdiv8(x, y);\n}\n\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : sdiv16(x, y);\n}\n\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : sdiv32(x, y);\n}\n\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : sdiv64(x, y);\n}\n\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {\n return sdiv_safe8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {\n return sdiv_safe16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {\n return sdiv_safe32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {\n return sdiv_safe64(x + y - 1, y);\n}\n\nstatic inline int8_t smod_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : smod8(x, y);\n}\n\nstatic inline int16_t smod_safe16(int16_t x, int16_t y) {\n return y == 0 ? 
0 : smod16(x, y);\n}\n\nstatic inline int32_t smod_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : smod32(x, y);\n}\n\nstatic inline int64_t smod_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : smod64(x, y);\n}\n\nstatic inline int8_t squot8(int8_t x, int8_t y) {\n return x / y;\n}\n\nstatic inline int16_t squot16(int16_t x, int16_t y) {\n return x / y;\n}\n\nstatic inline int32_t squot32(int32", "_t x, int32_t y) {\n return x / y;\n}\n\nstatic inline int64_t squot64(int64_t x, int64_t y) {\n return x / y;\n}\n\nstatic inline int8_t srem8(int8_t x, int8_t y) {\n return x % y;\n}\n\nstatic inline int16_t srem16(int16_t x, int16_t y) {\n return x % y;\n}\n\nstatic inline int32_t srem32(int32_t x, int32_t y) {\n return x % y;\n}\n\nstatic inline int64_t srem64(int64_t x, int64_t y) {\n return x % y;\n}\n\nstatic inline int8_t squot_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int16_t squot_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int32_t squot_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int64_t squot_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int8_t srem_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int16_t srem_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int32_t srem_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int64_t srem_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : x % y;\n}\n\n#endif\n\nstatic inline int8_t smin8(int8_t x, int8_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int16_t smin16(int16_t x, int16_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int32_t smin32(int32_t x, int32_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int64_t smin64(int64_t x, int64_t y) {\n return x < y ? x : y;\n}\n\nstatic inline uint8_t umin8(uint8_t x, uint8_t y) {\n return x < y ? 
x : y;\n}\n\nstatic inline uint16_t umin16(uint16_t x, uint16_t y) {\n return x < y ? x : y;\n}\n\nstatic inline uint32_t umin32(uint32_t x, uint32_t y) {\n return x < y ? x : y;\n}\n\nstatic inline uint64_t umin64(uint64_t x, uint64_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int8_t smax8(int8_t x, int8_t y) {\n return x < y ? y : x;\n}\n\nstatic inline int16_t smax16(int16_t x, int16_t y) {\n return x < y ? y : x;\n}\n\nstatic inline int32_t smax32(int32_t x, int32_t y) {\n return x < y ? y : x;\n}\n\ns",
"tatic inline int64_t smax64(int64_t x, int64_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint8_t umax8(uint8_t x, uint8_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint16_t umax16(uint16_t x, uint16_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint32_t umax32(uint32_t x, uint32_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint64_t umax64(uint64_t x, uint64_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint8_t shl8(uint8_t x, uint8_t y) {\n return (uint8_t)(x << y);\n}\n\nstatic inline uint16_t shl16(uint16_t x, uint16_t y) {\n return (uint16_t)(x << y);\n}\n\nstatic inline uint32_t shl32(uint32_t x, uint32_t y) {\n return x << y;\n}\n\nstatic inline uint64_t shl64(uint64_t x, uint64_t y) {\n return x << y;\n}\n\nstatic inline uint8_t lshr8(uint8_t x, uint8_t y) {\n return x >> y;\n}\n\nstatic inline uint16_t lshr16(uint16_t x, uint16_t y) {\n return x >> y;\n}\n\nstatic inline uint32_t lshr32(uint32_t x, uint32_t y) {\n return x >> y;\n}\n\nstatic inline uint64_t lshr64(uint64_t x, uint64_t y) {\n return x >> y;\n}\n\nstatic inline int8_t ashr8(int8_t x, int8_t y) {\n return x >> y;\n}\n\nstatic inline int16_t ashr16(int16_t x, int16_t y) {\n return x >> y;\n}\n\nstatic inline int32_t ashr32(int32_t x, int32_t y) {\n return x >> y;\n}\n\nstatic inline int64_t ashr64(int64_t x, int64_t y) {\n return x >> y;\n}\n\nstatic inline uint8_t and8(uint8_t x, uint8_t y) {\n return x & y;\n}\n\nstatic inline uint16_t and16(uint16_t x, uint16_t y) {\n return x & y;\n}\n\nstatic inline uint32_t and32(uint32_t x, uint32_t y) {\n return x & y;\n}\n\nstatic inline uint64_t and64(uint64_t x, uint64_t y) {\n return x & y;\n}\n\nstatic inline uint8_t or8(uint8_t x, uint8_t y) {\n return x | y;\n}\n\nstatic inline uint16_t or16(uint16_t x, uint16_t y) {\n return x | y;\n}\n\nstatic inline uint32_t or32(uint32_t x, uint32_t y) {\n return x | y;\n}\n\nstatic inline uint64_t or64(uint64_t x, uint64_t y) {\n return x | y;\n}\n\nstatic inline uint8_t 
xor8(uint8_t x, uint8_t y) {\n return x ^ y;\n}\n\nstatic inline uint16_t xor16(uint16_", "t x, uint16_t y) {\n return x ^ y;\n}\n\nstatic inline uint32_t xor32(uint32_t x, uint32_t y) {\n return x ^ y;\n}\n\nstatic inline uint64_t xor64(uint64_t x, uint64_t y) {\n return x ^ y;\n}\n\nstatic inline bool ult8(uint8_t x, uint8_t y) {\n return x < y;\n}\n\nstatic inline bool ult16(uint16_t x, uint16_t y) {\n return x < y;\n}\n\nstatic inline bool ult32(uint32_t x, uint32_t y) {\n return x < y;\n}\n\nstatic inline bool ult64(uint64_t x, uint64_t y) {\n return x < y;\n}\n\nstatic inline bool ule8(uint8_t x, uint8_t y) {\n return x <= y;\n}\n\nstatic inline bool ule16(uint16_t x, uint16_t y) {\n return x <= y;\n}\n\nstatic inline bool ule32(uint32_t x, uint32_t y) {\n return x <= y;\n}\n\nstatic inline bool ule64(uint64_t x, uint64_t y) {\n return x <= y;\n}\n\nstatic inline bool slt8(int8_t x, int8_t y) {\n return x < y;\n}\n\nstatic inline bool slt16(int16_t x, int16_t y) {\n return x < y;\n}\n\nstatic inline bool slt32(int32_t x, int32_t y) {\n return x < y;\n}\n\nstatic inline bool slt64(int64_t x, int64_t y) {\n return x < y;\n}\n\nstatic inline bool sle8(int8_t x, int8_t y) {\n return x <= y;\n}\n\nstatic inline bool sle16(int16_t x, int16_t y) {\n return x <= y;\n}\n\nstatic inline bool sle32(int32_t x, int32_t y) {\n return x <= y;\n}\n\nstatic inline bool sle64(int64_t x, int64_t y) {\n return x <= y;\n}\n\nstatic inline uint8_t pow8(uint8_t x, uint8_t y) {\n uint8_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\n\nstatic inline uint16_t pow16(uint16_t x, uint16_t y) {\n uint16_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\n\nstatic inline uint32_t pow32(uint32_t x, uint32_t y) {\n uint32_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\n\nstatic inline uint64_t 
pow64(uint64_t x, uint64_t y) {\n uint64_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n ", "x *= x;\n }\n return res;\n}\n\nstatic inline bool itob_i8_bool(int8_t x) {\n return x != 0;\n}\n\nstatic inline bool itob_i16_bool(int16_t x) {\n return x != 0;\n}\n\nstatic inline bool itob_i32_bool(int32_t x) {\n return x != 0;\n}\n\nstatic inline bool itob_i64_bool(int64_t x) {\n return x != 0;\n}\n\nstatic inline int8_t btoi_bool_i8(bool x) {\n return x;\n}\n\nstatic inline int16_t btoi_bool_i16(bool x) {\n return x;\n}\n\nstatic inline int32_t btoi_bool_i32(bool x) {\n return x;\n}\n\nstatic inline int64_t btoi_bool_i64(bool x) {\n return x;\n}\n\n#define sext_i8_i8(x) ((int8_t) (int8_t) (x))\n#define sext_i8_i16(x) ((int16_t) (int8_t) (x))\n#define sext_i8_i32(x) ((int32_t) (int8_t) (x))\n#define sext_i8_i64(x) ((int64_t) (int8_t) (x))\n#define sext_i16_i8(x) ((int8_t) (int16_t) (x))\n#define sext_i16_i16(x) ((int16_t) (int16_t) (x))\n#define sext_i16_i32(x) ((int32_t) (int16_t) (x))\n#define sext_i16_i64(x) ((int64_t) (int16_t) (x))\n#define sext_i32_i8(x) ((int8_t) (int32_t) (x))\n#define sext_i32_i16(x) ((int16_t) (int32_t) (x))\n#define sext_i32_i32(x) ((int32_t) (int32_t) (x))\n#define sext_i32_i64(x) ((int64_t) (int32_t) (x))\n#define sext_i64_i8(x) ((int8_t) (int64_t) (x))\n#define sext_i64_i16(x) ((int16_t) (int64_t) (x))\n#define sext_i64_i32(x) ((int32_t) (int64_t) (x))\n#define sext_i64_i64(x) ((int64_t) (int64_t) (x))\n#define zext_i8_i8(x) ((int8_t) (uint8_t) (x))\n#define zext_i8_i16(x) ((int16_t) (uint8_t) (x))\n#define zext_i8_i32(x) ((int32_t) (uint8_t) (x))\n#define zext_i8_i64(x) ((int64_t) (uint8_t) (x))\n#define zext_i16_i8(x) ((int8_t) (uint16_t) (x))\n#define zext_i16_i16(x) ((int16_t) (uint16_t) (x))\n#define zext_i16_i32(x) ((int32_t) (uint16_t) (x))\n#define zext_i16_i64(x) ((int64_t) (uint16_t) (x))\n#define zext_i32_i8(x) ((int8_t) (uint32_t) (x))\n#define zext_i32_i16(x) ((int16_t) (uint32_t) 
(x))\n#define zext_i32_i32(x) ((int32_t) (uint32_t) (x))\n#define zext_i32_i64(x) ((int64_t) (uint32_t) (x))\n#define zext_i64_i8(x) ((int8_t) (uint64_t) (x))\n#define zext_i64_i16(x) ((int16_t) (",
"uint64_t) (x))\n#define zext_i64_i32(x) ((int32_t) (uint64_t) (x))\n#define zext_i64_i64(x) ((int64_t) (uint64_t) (x))\n\nstatic int8_t abs8(int8_t x) {\n return (int8_t)abs(x);\n}\n\nstatic int16_t abs16(int16_t x) {\n return (int16_t)abs(x);\n}\n\nstatic int32_t abs32(int32_t x) {\n return abs(x);\n}\n\nstatic int64_t abs64(int64_t x) {\n#if defined(__OPENCL_VERSION__) || defined(ISPC)\n return abs(x);\n#else\n return llabs(x);\n#endif\n}\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_popc8(int8_t x) {\n return popcount(x);\n}\n\nstatic int32_t futrts_popc16(int16_t x) {\n return popcount(x);\n}\n\nstatic int32_t futrts_popc32(int32_t x) {\n return popcount(x);\n}\n\nstatic int32_t futrts_popc64(int64_t x) {\n return popcount(x);\n}\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_popc8(int8_t x) {\n return __popc(zext_i8_i32(x));\n}\n\nstatic int32_t futrts_popc16(int16_t x) {\n return __popc(zext_i16_i32(x));\n}\n\nstatic int32_t futrts_popc32(int32_t x) {\n return __popc(x);\n}\n\nstatic int32_t futrts_popc64(int64_t x) {\n return __popcll(x);\n}\n\n#else // Not OpenCL or CUDA, but plain C.\n\nstatic int32_t futrts_popc8(uint8_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n\nstatic int32_t futrts_popc16(uint16_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n\nstatic int32_t futrts_popc32(uint32_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n\nstatic int32_t futrts_popc64(uint64_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); }\nstatic uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return 
mul_hi(a, b); }\nstatic uint16_t futrts_smul_hi16(int16_t a, int16_t b", ") { return mul_hi(a, b); }\nstatic uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); }\nstatic uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); }\n#elif defined(__CUDA_ARCH__)\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); }\nstatic uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }\nstatic uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }\nstatic uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }\nstatic uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }\n#elif ISPC\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {\n uint64_t ah = a >> 32;\n uint64_t al = a & 0xffffffff;\n uint64_t bh = b >> 32;\n uint64_t bl = b & 0xffffffff;\n\n uint64_t p1 = al * bl;\n uint64_t p2 = al * bh;\n uint64_t p3 = ah * bl;\n uint64_t p4 = ah * bh;\n\n uint64_t p1h = p1 >> 32;\n uint64_t p2h = p2 >> 32;\n uint64_t p3h = p3 >> 32;\n uint64_t p2l = p2 & 0xffffffff;\n uint64_t p3l = p3 & 0xffffffff;\n\n uint64_t l = p1h + p2l + p3l;\n uint64_t m = (p2 >> 32) + (p3 >> 32);\n uint64_t h = (l >> 32) + m + p4;\n\n return h;\n}\nstatic int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * 
((uint16_t)b) >> 8; }\nstatic int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a", ") * ((uint32_t)b) >> 16; }\nstatic int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic int64_t futrts_smul_hi64(int64_t a, int64_t b) {\n uint64_t ah = a >> 32;\n uint64_t al = a & 0xffffffff;\n uint64_t bh = b >> 32;\n uint64_t bl = b & 0xffffffff;\n\n uint64_t p1 = al * bl;\n int64_t p2 = al * bh;\n int64_t p3 = ah * bl;\n uint64_t p4 = ah * bh;\n\n uint64_t p1h = p1 >> 32;\n uint64_t p2h = p2 >> 32;\n uint64_t p3h = p3 >> 32;\n uint64_t p2l = p2 & 0xffffffff;\n uint64_t p3l = p3 & 0xffffffff;\n\n uint64_t l = p1h + p2l + p3l;\n uint64_t m = (p2 >> 32) + (p3 >> 32);\n uint64_t h = (l >> 32) + m + p4;\n\n return h;\n}\n\n#else // Not OpenCL, ISPC, or CUDA, but plain C.\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; }\nstatic int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }\nstatic int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }\nstatic int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; }\nstatic int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; }\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); }\nstatic uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); }\nstatic uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); 
}\nstatic uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); }\nstatic uint8_t futrts_sm",
"ad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); }\nstatic uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); }\nstatic uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); }\nstatic uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); }\n#else // Not OpenCL\n\nstatic uint8_t futrts_umad_hi8( uint8_t a, uint8_t b, uint8_t c) { return futrts_umul_hi8(a, b) + c; }\nstatic uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; }\nstatic uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; }\nstatic uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; }\nstatic uint8_t futrts_smad_hi8 ( int8_t a, int8_t b, int8_t c) { return futrts_smul_hi8(a, b) + c; }\nstatic uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; }\nstatic uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; }\nstatic uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; }\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_clzz8(int8_t x) {\n return clz(x);\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n return clz(x);\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return clz(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return clz(x);\n}\n\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_clzz8(int8_t x) {\n return __clz(zext_i8_i32(x)) - 24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n return __clz(zext_i16_i32(x)) - 16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return __clz(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return __clzll(x);\n}\n\n#elif ISPC\n\nstatic int32_t futrts_clzz8(int8_t x) {\n return count_leading_zeros((int32_t)(uint8_t)x)-24;\n}\n\nstatic int32_t 
futrts_clzz16(int16_t x) {\n return count_lead", "ing_zeros((int32_t)(uint16_t)x)-16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return count_leading_zeros(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return count_leading_zeros(x);\n}\n\n#else // Not OpenCL, ISPC or CUDA, but plain C.\n\nstatic int32_t futrts_clzz8(int8_t x) {\n return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return x == 0 ? 32 : __builtin_clz((uint32_t)x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return x == 0 ? 64 : __builtin_clzll((uint64_t)x);\n}\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_ctzz8(int8_t x) {\n int i = 0;\n for (; i < 8 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n int i = 0;\n for (; i < 16 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n int i = 0;\n for (; i < 32 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n int i = 0;\n for (; i < 64 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n int y = __ffs(x);\n return y == 0 ? 8 : y - 1;\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n int y = __ffs(x);\n return y == 0 ? 16 : y - 1;\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n int y = __ffs(x);\n return y == 0 ? 32 : y - 1;\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n int y = __ffsll(x);\n return y == 0 ? 64 : y - 1;\n}\n\n#elif ISPC\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n return x == 0 ? 8 : count_trailing_zeros((int32_t)x);\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n return x == 0 ? 
16 : count_trailing_zeros((int32_t)x);\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n return count_trailing_zeros(x);\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n return count_trailing_zeros(x);\n}\n\n#else // Not OpenCL or CUDA,", " but plain C.\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n return x == 0 ? 8 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n return x == 0 ? 16 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n return x == 0 ? 32 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n return x == 0 ? 64 : __builtin_ctzll((uint64_t)x);\n}\n#endif\n\nstatic inline float fdiv32(float x, float y) {\n return x / y;\n}\n\nstatic inline float fadd32(float x, float y) {\n return x + y;\n}\n\nstatic inline float fsub32(float x, float y) {\n return x - y;\n}\n\nstatic inline float fmul32(float x, float y) {\n return x * y;\n}\n\nstatic inline bool cmplt32(float x, float y) {\n return x < y;\n}\n\nstatic inline bool cmple32(float x, float y) {\n return x <= y;\n}\n\nstatic inline float sitofp_i8_f32(int8_t x) {\n return (float) x;\n}\n\nstatic inline float sitofp_i16_f32(int16_t x) {\n return (float) x;\n}\n\nstatic inline float sitofp_i32_f32(int32_t x) {\n return (float) x;\n}\n\nstatic inline float sitofp_i64_f32(int64_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i8_f32(uint8_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i16_f32(uint16_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i32_f32(uint32_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i64_f32(uint64_t x) {\n return (float) x;\n}\n\n#ifdef __OPENCL_VERSION__\nstatic inline float fabs32(float x) {\n return fabs(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n return fmax(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n return fmin(x, y);\n}\n\nstatic inline float fpow32(float x, float y) {\n return pow(x, y);\n}\n\n#elif ISPC\n\nstatic 
inline float fabs32(float x) {\n return abs(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n return isnan(x) ? y : isnan(y) ? x : max(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n return isnan(x) ? y : isnan(y) ? x : min(x, y);\n}\n\nstatic inline float fpow32(float a, float b) {\n float ret;\n f",
"oreach_active (i) {\n uniform float r = __stdlib_powf(extract(a, i), extract(b, i));\n ret = insert(ret, i, r);\n }\n return ret;\n}\n\n#else // Not OpenCL, but CUDA or plain C.\n\nstatic inline float fabs32(float x) {\n return fabsf(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n return fmaxf(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n return fminf(x, y);\n}\n\nstatic inline float fpow32(float x, float y) {\n return powf(x, y);\n}\n#endif\n\nstatic inline bool futrts_isnan32(float x) {\n return isnan(x);\n}\n\n#if ISPC\n\nstatic inline bool futrts_isinf32(float x) {\n return !isnan(x) && isnan(x - x);\n}\n\nstatic inline bool futrts_isfinite32(float x) {\n return !isnan(x) && !futrts_isinf32(x);\n}\n\n#else\n\nstatic inline bool futrts_isinf32(float x) {\n return isinf(x);\n}\n\n#endif\n\nstatic inline int8_t fptosi_f32_i8(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int8_t) x;\n }\n}\n\nstatic inline int16_t fptosi_f32_i16(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int16_t) x;\n }\n}\n\nstatic inline int32_t fptosi_f32_i32(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int32_t) x;\n }\n}\n\nstatic inline int64_t fptosi_f32_i64(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int64_t) x;\n };\n}\n\nstatic inline uint8_t fptoui_f32_i8(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (uint8_t) (int8_t) x;\n }\n}\n\nstatic inline uint16_t fptoui_f32_i16(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (uint16_t) (int16_t) x;\n }\n}\n\nstatic inline uint32_t fptoui_f32_i32(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (uint32_t) (int32_t) x;\n }\n}\n\nstatic inline uint64_t fptoui_f32_i64(float x) {\n if (futrts_isnan32(x) || 
futrts_isinf32(x)) {\n ret", "urn 0;\n } else {\n return (uint64_t) (int64_t) x;\n }\n}\n\nstatic inline bool ftob_f32_bool(float x) {\n return x != 0;\n}\n\nstatic inline float btof_bool_f32(bool x) {\n return x ? 1 : 0;\n}\n\n#ifdef __OPENCL_VERSION__\nstatic inline float futrts_log32(float x) {\n return log(x);\n}\n\nstatic inline float futrts_log2_32(float x) {\n return log2(x);\n}\n\nstatic inline float futrts_log10_32(float x) {\n return log10(x);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n return log1p(x);\n}\n\nstatic inline float futrts_sqrt32(float x) {\n return sqrt(x);\n}\n\nstatic inline float futrts_cbrt32(float x) {\n return cbrt(x);\n}\n\nstatic inline float futrts_exp32(float x) {\n return exp(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n return cos(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n return sin(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n return tan(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n return acos(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n return asin(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n return atan(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n return cosh(x);\n}\n\nstatic inline float futrts_sinh32(float x) {\n return sinh(x);\n}\n\nstatic inline float futrts_tanh32(float x) {\n return tanh(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n return acosh(x);\n}\n\nstatic inline float futrts_asinh32(float x) {\n return asinh(x);\n}\n\nstatic inline float futrts_atanh32(float x) {\n return atanh(x);\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n return atan2(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n return hypot(x, y);\n}\n\nstatic inline float futrts_gamma32(float x) {\n return tgamma(x);\n}\n\nstatic inline float futrts_lgamma32(float x) {\n return lgamma(x);\n}\n\nstatic inline float futrts_erf32(float x) {\n return erf(x);\n}\n\nstatic inline float futrts_erfc32(float x) {\n 
return erfc(x);\n}\n\nstatic inline float fmod32(float x, float y) {\n return fmod(x, y);\n}\n\nstatic inline float futrt", "s_round32(float x) {\n return rint(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n return floor(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n return ceil(x);\n}\n\nstatic inline float futrts_nextafter32(float x, float y) {\n return nextafter(x, y);\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n return mix(v0, v1, t);\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n return mad(a, b, c);\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n return fma(a, b, c);\n}\n\n#elif ISPC\n\nstatic inline float futrts_log32(float x) {\n return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x;\n}\n\nstatic inline float futrts_log2_32(float x) {\n return futrts_log32(x) / log(2.0f);\n}\n\nstatic inline float futrts_log10_32(float x) {\n return futrts_log32(x) / log(10.0f);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n if(x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f;\n float y = 1.0f + x;\n float z = y - 1.0f;\n return log(y) - (z-x)/y;\n}\n\nstatic inline float futrts_sqrt32(float x) {\n return sqrt(x);\n}\n\nextern \"C\" unmasked uniform float cbrtf(uniform float);\nstatic inline float futrts_cbrt32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = cbrtf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline float futrts_exp32(float x) {\n return exp(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n return cos(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n return sin(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n return tan(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n return acos(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n return asin(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n return atan(x);\n}\n\nstatic inline float 
futrts_cosh32(float x) {\n return (exp(x)+exp(-x)) / 2.0f;\n}\n\nstatic inline float futrts_sinh32(float x) {\n return (exp(x)-exp(-x)) / 2.0f;\n}\n\nstatic inline float futrts_tanh32(float x) {\n retur",
"n futrts_sinh32(x)/futrts_cosh32(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n float f = x+sqrt(x*x-1);\n if(futrts_isfinite32(f)) return log(f);\n return f;\n}\n\nstatic inline float futrts_asinh32(float x) {\n float f = x+sqrt(x*x+1);\n if(futrts_isfinite32(f)) return log(f);\n return f;\n\n}\n\nstatic inline float futrts_atanh32(float x) {\n float f = (1+x)/(1-x);\n if(futrts_isfinite32(f)) return log(f)/2.0f;\n return f;\n\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n return (x == 0.0f && y == 0.0f) ? 0.0f : atan2(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n if (futrts_isfinite32(x) && futrts_isfinite32(y)) {\n x = abs(x);\n y = abs(y);\n float a;\n float b;\n if (x >= y){\n a = x;\n b = y;\n } else {\n a = y;\n b = x;\n }\n if(b == 0){\n return a;\n }\n\n int e;\n float an;\n float bn;\n an = frexp (a, &e);\n bn = ldexp (b, - e);\n float cn;\n cn = sqrt (an * an + bn * bn);\n return ldexp (cn, e);\n } else {\n if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY;\n else return x + y;\n }\n\n}\n\nextern \"C\" unmasked uniform float tgammaf(uniform float x);\nstatic inline float futrts_gamma32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = tgammaf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float lgammaf(uniform float x);\nstatic inline float futrts_lgamma32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = lgammaf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float erff(uniform float x);\nstatic inline float futrts_erf32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = erff(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float erfcf(uniform float x);\nstatic inline float futrts_erfc32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = erfcf(extr", "act(x, i));\n res = insert(res, 
i, r);\n }\n return res;\n}\n\nstatic inline float fmod32(float x, float y) {\n return x - y * trunc(x/y);\n}\n\nstatic inline float futrts_round32(float x) {\n return round(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n return floor(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n return ceil(x);\n}\n\nextern \"C\" unmasked uniform float nextafterf(uniform float x, uniform float y);\nstatic inline float futrts_nextafter32(float x, float y) {\n float res;\n foreach_active (i) {\n uniform float r = nextafterf(extract(x, i), extract(y, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n return a * b + c;\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n return a * b + c;\n}\n\n#else // Not OpenCL or ISPC, but CUDA or plain C.\n\nstatic inline float futrts_log32(float x) {\n return logf(x);\n}\n\nstatic inline float futrts_log2_32(float x) {\n return log2f(x);\n}\n\nstatic inline float futrts_log10_32(float x) {\n return log10f(x);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n return log1pf(x);\n}\n\nstatic inline float futrts_sqrt32(float x) {\n return sqrtf(x);\n}\n\nstatic inline float futrts_cbrt32(float x) {\n return cbrtf(x);\n}\n\nstatic inline float futrts_exp32(float x) {\n return expf(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n return cosf(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n return sinf(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n return tanf(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n return acosf(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n return asinf(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n return atanf(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n return coshf(x);\n}\n\nstatic inline float futrts_sinh32(float x) {\n return sinhf(x);\n}\n\nstatic 
inline float futrts_tanh32(float x) {\n r", "eturn tanhf(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n return acoshf(x);\n}\n\nstatic inline float futrts_asinh32(float x) {\n return asinhf(x);\n}\n\nstatic inline float futrts_atanh32(float x) {\n return atanhf(x);\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n return atan2f(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n return hypotf(x, y);\n}\n\nstatic inline float futrts_gamma32(float x) {\n return tgammaf(x);\n}\n\nstatic inline float futrts_lgamma32(float x) {\n return lgammaf(x);\n}\n\nstatic inline float futrts_erf32(float x) {\n return erff(x);\n}\n\nstatic inline float futrts_erfc32(float x) {\n return erfcf(x);\n}\n\nstatic inline float fmod32(float x, float y) {\n return fmodf(x, y);\n}\n\nstatic inline float futrts_round32(float x) {\n return rintf(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n return floorf(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n return ceilf(x);\n}\n\nstatic inline float futrts_nextafter32(float x, float y) {\n return nextafterf(x, y);\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n return a * b + c;\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n return fmaf(a, b, c);\n}\n#endif\n\n#if ISPC\nstatic inline int32_t futrts_to_bits32(float x) {\n return intbits(x);\n}\n\nstatic inline float futrts_from_bits32(int32_t x) {\n return floatbits(x);\n}\n#else\nstatic inline int32_t futrts_to_bits32(float x) {\n union {\n float f;\n int32_t t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline float futrts_from_bits32(int32_t x) {\n union {\n int32_t f;\n float t;\n } p;\n\n p.f = x;\n return p.t;\n}\n#endif\n\nstatic inline float fsignum32(float x) {\n return futrts_isnan32(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 
1 : 0);\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\n#if ISPC\nstatic inline bool futrts_isinf64(float x) {\n return !isnan(x) && isnan(x - x);\n}\n\nstatic inline bool futrts_isfinite64(fl",
"oat x) {\n return !isnan(x) && !futrts_isinf64(x);\n}\n\nstatic inline double fdiv64(double x, double y) {\n return x / y;\n}\n\nstatic inline double fadd64(double x, double y) {\n return x + y;\n}\n\nstatic inline double fsub64(double x, double y) {\n return x - y;\n}\n\nstatic inline double fmul64(double x, double y) {\n return x * y;\n}\n\nstatic inline bool cmplt64(double x, double y) {\n return x < y;\n}\n\nstatic inline bool cmple64(double x, double y) {\n return x <= y;\n}\n\nstatic inline double sitofp_i8_f64(int8_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i16_f64(int16_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i32_f64(int32_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i64_f64(int64_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i8_f64(uint8_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i16_f64(uint16_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i32_f64(uint32_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i64_f64(uint64_t x) {\n return (double) x;\n}\n\nstatic inline double fabs64(double x) {\n return abs(x);\n}\n\nstatic inline double fmax64(double x, double y) {\n return isnan(x) ? y : isnan(y) ? x : max(x, y);\n}\n\nstatic inline double fmin64(double x, double y) {\n return isnan(x) ? y : isnan(y) ? x : min(x, y);\n}\n\nstatic inline double fpow64(double a, double b) {\n float ret;\n foreach_active (i) {\n uniform float r = __stdlib_powf(extract(a, i), extract(b, i));\n ret = insert(ret, i, r);\n }\n return ret;\n}\n\nstatic inline double futrts_log64(double x) {\n return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? 
log(x) : x;\n}\n\nstatic inline double futrts_log2_64(double x) {\n return futrts_log64(x)/log(2.0d);\n}\n\nstatic inline double futrts_log10_64(double x) {\n return futrts_log64(x)/log(10.0d);\n}\n\nstatic inline double futrts_log1p_64(double x) {\n if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;\n double y = 1.0d + x;\n double z = y - 1.0d;\n return log", "(y) - (z-x)/y;\n}\n\nstatic inline double futrts_sqrt64(double x) {\n return sqrt(x);\n}\n\nextern \"C\" unmasked uniform double cbrt(uniform double);\nstatic inline double futrts_cbrt64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = cbrtf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_exp64(double x) {\n return exp(x);\n}\n\nstatic inline double futrts_cos64(double x) {\n return cos(x);\n}\n\nstatic inline double futrts_sin64(double x) {\n return sin(x);\n}\n\nstatic inline double futrts_tan64(double x) {\n return tan(x);\n}\n\nstatic inline double futrts_acos64(double x) {\n return acos(x);\n}\n\nstatic inline double futrts_asin64(double x) {\n return asin(x);\n}\n\nstatic inline double futrts_atan64(double x) {\n return atan(x);\n}\n\nstatic inline double futrts_cosh64(double x) {\n return (exp(x)+exp(-x)) / 2.0d;\n}\n\nstatic inline double futrts_sinh64(double x) {\n return (exp(x)-exp(-x)) / 2.0d;\n}\n\nstatic inline double futrts_tanh64(double x) {\n return futrts_sinh64(x)/futrts_cosh64(x);\n}\n\nstatic inline double futrts_acosh64(double x) {\n double f = x+sqrt(x*x-1.0d);\n if(futrts_isfinite64(f)) return log(f);\n return f;\n}\n\nstatic inline double futrts_asinh64(double x) {\n double f = x+sqrt(x*x+1.0d);\n if(futrts_isfinite64(f)) return log(f);\n return f;\n}\n\nstatic inline double futrts_atanh64(double x) {\n double f = (1.0d+x)/(1.0d-x);\n if(futrts_isfinite64(f)) return log(f)/2.0d;\n return f;\n\n}\n\nstatic inline double futrts_atan2_64(double x, double y) {\n return atan2(x, y);\n}\n\nextern \"C\" 
unmasked uniform double hypot(uniform double x, uniform double y);\nstatic inline double futrts_hypot64(double x, double y) {\n double res;\n foreach_active (i) {\n uniform double r = hypot(extract(x, i), extract(y, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double tgamma(uniform double x);\nstatic inline double futrts_gamma64(double x) {\n double res;\n foreach_active (i) {\n uniform double r", " = tgamma(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double lgamma(uniform double x);\nstatic inline double futrts_lgamma64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = lgamma(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double erf(uniform double x);\nstatic inline double futrts_erf64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = erf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double erfc(uniform double x);\nstatic inline double futrts_erfc64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = erfc(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_fma64(double a, double b, double c) {\n return a * b + c;\n}\n\nstatic inline double futrts_round64(double x) {\n return round(x);\n}\n\nstatic inline double futrts_ceil64(double x) {\n return ceil(x);\n}\n\nextern \"C\" unmasked uniform double nextafter(uniform float x, uniform double y);\nstatic inline float futrts_nextafter64(double x, double y) {\n double res;\n foreach_active (i) {\n uniform double r = nextafter(extract(x, i), extract(y, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_floor64(double x) {\n return floor(x);\n}\n\nstatic inline bool futrts_isnan64(double x) {\n return isnan(x);\n}\n\nstatic inline int8_t fptosi_f64_i8(double x) {\n if (futrts_isnan64(x) || 
futrts_isinf64(x)) {\n return 0;\n } else {\n return (int8_t) x;\n }\n}\n\nstatic inline int16_t fptosi_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int16_t) x;\n }\n}\n\nstatic inline int32_t fptosi_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int32_t) x;\n }\n}\n\nstatic inline int64_t fptosi_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {",
"\n return 0;\n } else {\n return (int64_t) x;\n }\n}\n\nstatic inline uint8_t fptoui_f64_i8(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint8_t) (int8_t) x;\n }\n}\n\nstatic inline uint16_t fptoui_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint16_t) (int16_t) x;\n }\n}\n\nstatic inline uint32_t fptoui_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint32_t) (int32_t) x;\n }\n}\n\nstatic inline uint64_t fptoui_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint64_t) (int64_t) x;\n }\n}\n\nstatic inline bool ftob_f64_bool(double x) {\n return x != 0.0;\n}\n\nstatic inline double btof_bool_f64(bool x) {\n return x ? 1.0 : 0.0;\n}\n\nstatic inline int64_t futrts_to_bits64(double x) {\n int64_t res;\n foreach_active (i) {\n uniform double tmp = extract(x, i);\n uniform int64_t r = *((uniform int64_t* uniform)&tmp);\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_from_bits64(int64_t x) {\n double res;\n foreach_active (i) {\n uniform int64_t tmp = extract(x, i);\n uniform double r = *((uniform double* uniform)&tmp);\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double fmod64(double x, double y) {\n return x - y * trunc(x/y);\n}\n\nstatic inline double fsignum64(double x) {\n return futrts_isnan64(x) ? x : (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 
1.0d : 0.0d);\n}\n\nstatic inline double futrts_lerp64(double v0, double v1, double t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline double futrts_mad64(double a, double b, double c) {\n return a * b + c;\n}\n\nstatic inline float fpconv_f32_f32(float x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f32_f64(float x) {\n return (double) x;\n}\n\nstatic inline float fpconv_f64_f32(double x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f64_f64(double x) {\n return (double", ") x;\n}\n\n#else\n\nstatic inline double fdiv64(double x, double y) {\n return x / y;\n}\n\nstatic inline double fadd64(double x, double y) {\n return x + y;\n}\n\nstatic inline double fsub64(double x, double y) {\n return x - y;\n}\n\nstatic inline double fmul64(double x, double y) {\n return x * y;\n}\n\nstatic inline bool cmplt64(double x, double y) {\n return x < y;\n}\n\nstatic inline bool cmple64(double x, double y) {\n return x <= y;\n}\n\nstatic inline double sitofp_i8_f64(int8_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i16_f64(int16_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i32_f64(int32_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i64_f64(int64_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i8_f64(uint8_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i16_f64(uint16_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i32_f64(uint32_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i64_f64(uint64_t x) {\n return (double) x;\n}\n\nstatic inline double fabs64(double x) {\n return fabs(x);\n}\n\nstatic inline double fmax64(double x, double y) {\n return fmax(x, y);\n}\n\nstatic inline double fmin64(double x, double y) {\n return fmin(x, y);\n}\n\nstatic inline double fpow64(double x, double y) {\n return pow(x, y);\n}\n\nstatic inline double futrts_log64(double x) {\n return log(x);\n}\n\nstatic inline double futrts_log2_64(double x) {\n return 
log2(x);\n}\n\nstatic inline double futrts_log10_64(double x) {\n return log10(x);\n}\n\nstatic inline double futrts_log1p_64(double x) {\n return log1p(x);\n}\n\nstatic inline double futrts_sqrt64(double x) {\n return sqrt(x);\n}\n\nstatic inline double futrts_cbrt64(double x) {\n return cbrt(x);\n}\n\nstatic inline double futrts_exp64(double x) {\n return exp(x);\n}\n\nstatic inline double futrts_cos64(double x) {\n return cos(x);\n}\n\nstatic inline double futrts_sin64(double x) {\n return sin(x);\n}\n\nstatic inline double futrts_tan64(double x) {\n return tan(x);\n}\n\nstatic inline double futrts_a", "cos64(double x) {\n return acos(x);\n}\n\nstatic inline double futrts_asin64(double x) {\n return asin(x);\n}\n\nstatic inline double futrts_atan64(double x) {\n return atan(x);\n}\n\nstatic inline double futrts_cosh64(double x) {\n return cosh(x);\n}\n\nstatic inline double futrts_sinh64(double x) {\n return sinh(x);\n}\n\nstatic inline double futrts_tanh64(double x) {\n return tanh(x);\n}\n\nstatic inline double futrts_acosh64(double x) {\n return acosh(x);\n}\n\nstatic inline double futrts_asinh64(double x) {\n return asinh(x);\n}\n\nstatic inline double futrts_atanh64(double x) {\n return atanh(x);\n}\n\nstatic inline double futrts_atan2_64(double x, double y) {\n return atan2(x, y);\n}\n\nstatic inline double futrts_hypot64(double x, double y) {\n return hypot(x, y);\n}\n\nstatic inline double futrts_gamma64(double x) {\n return tgamma(x);\n}\n\nstatic inline double futrts_lgamma64(double x) {\n return lgamma(x);\n}\n\nstatic inline double futrts_erf64(double x) {\n return erf(x);\n}\n\nstatic inline double futrts_erfc64(double x) {\n return erfc(x);\n}\n\nstatic inline double futrts_fma64(double a, double b, double c) {\n return fma(a, b, c);\n}\n\nstatic inline double futrts_round64(double x) {\n return rint(x);\n}\n\nstatic inline double futrts_ceil64(double x) {\n return ceil(x);\n}\n\nstatic inline float futrts_nextafter64(float x, float y) {\n return 
nextafter(x, y);\n}\n\nstatic inline double futrts_floor64(double x) {\n return floor(x);\n}\n\nstatic inline bool futrts_isnan64(double x) {\n return isnan(x);\n}\n\nstatic inline bool futrts_isinf64(double x) {\n return isinf(x);\n}\n\nstatic inline int8_t fptosi_f64_i8(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int8_t) x;\n }\n}\n\nstatic inline int16_t fptosi_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int16_t) x;\n }\n}\n\nstatic inline int32_t fptosi_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int32_t",
") x;\n }\n}\n\nstatic inline int64_t fptosi_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int64_t) x;\n }\n}\n\nstatic inline uint8_t fptoui_f64_i8(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint8_t) (int8_t) x;\n }\n}\n\nstatic inline uint16_t fptoui_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint16_t) (int16_t) x;\n }\n}\n\nstatic inline uint32_t fptoui_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint32_t) (int32_t) x;\n }\n}\n\nstatic inline uint64_t fptoui_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint64_t) (int64_t) x;\n }\n}\n\nstatic inline bool ftob_f64_bool(double x) {\n return x != 0;\n}\n\nstatic inline double btof_bool_f64(bool x) {\n return x ? 1 : 0;\n}\n\nstatic inline int64_t futrts_to_bits64(double x) {\n union {\n double f;\n int64_t t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline double futrts_from_bits64(int64_t x) {\n union {\n int64_t f;\n double t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline double fmod64(double x, double y) {\n return fmod(x, y);\n}\n\nstatic inline double fsignum64(double x) {\n return futrts_isnan64(x) ? 
x : (x > 0) - (x < 0);\n}\n\nstatic inline double futrts_lerp64(double v0, double v1, double t) {\n#ifdef __OPENCL_VERSION__\n return mix(v0, v1, t);\n#else\n return v0 + (v1 - v0) * t;\n#endif\n}\n\nstatic inline double futrts_mad64(double a, double b, double c) {\n#ifdef __OPENCL_VERSION__\n return mad(a, b, c);\n#else\n return a * b + c;\n#endif\n}\n\nstatic inline float fpconv_f32_f32(float x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f32_f64(float x) {\n return (double) x;\n}\n\nstatic inline float fpconv_f64_f32(double x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f64_f64(double x) {\n return (double) x;\n}\n\n#endif\n\n#endif\n\n// End", " of scalar.h.\n// Start of scalar_f16.h.\n\n// Half-precision is emulated if needed (e.g. in straight C) with the\n// native type used if possible. The emulation works by typedef'ing\n// 'float' to 'f16', and then implementing all operations on single\n// precision. To cut down on duplication, we use the same code for\n// those Futhark functions that require just operators or casts. 
The\n// in-memory representation for arrays will still be 16 bits even\n// under emulation, so the compiler will have to be careful when\n// generating reads or writes.\n\n#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC))\n#define EMULATE_F16\n#endif\n\n#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__)\n#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n#endif\n\n#ifdef EMULATE_F16\n\n// Note that the half-precision storage format is still 16 bits - the\n// compiler will have to be real careful!\ntypedef float f16;\n\n#elif ISPC\ntypedef float16 f16;\n\n#else\n\n#ifdef __CUDA_ARCH__\n#include <cuda_fp16.h>\n#endif\n\ntypedef half f16;\n\n#endif\n\n// Some of these functions convert to single precision because half\n// precision versions are not available.\n\nstatic inline f16 fadd16(f16 x, f16 y) {\n return x + y;\n}\n\nstatic inline f16 fsub16(f16 x, f16 y) {\n return x - y;\n}\n\nstatic inline f16 fmul16(f16 x, f16 y) {\n return x * y;\n}\n\nstatic inline bool cmplt16(f16 x, f16 y) {\n return x < y;\n}\n\nstatic inline bool cmple16(f16 x, f16 y) {\n return x <= y;\n}\n\nstatic inline f16 sitofp_i8_f16(int8_t x) {\n return (f16) x;\n}\n\nstatic inline f16 sitofp_i16_f16(int16_t x) {\n return (f16) x;\n}\n\nstatic inline f16 sitofp_i32_f16(int32_t x) {\n return (f16) x;\n}\n\nstatic inline f16 sitofp_i64_f16(int64_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i8_f16(uint8_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i16_f16(uint16_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i32_f16(uint32_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i64_f16(uint64_t x) {\n return (f1", "6) x;\n}\n\nstatic inline int8_t fptosi_f16_i8(f16 x) {\n return (int8_t) (float) x;\n}\n\nstatic inline int16_t fptosi_f16_i16(f16 x) {\n return (int16_t) x;\n}\n\nstatic inline int32_t fptosi_f16_i32(f16 x) {\n return (int32_t) x;\n}\n\nstatic inline int64_t fptosi_f16_i64(f16 x) {\n return (int64_t) 
x;\n}\n\nstatic inline uint8_t fptoui_f16_i8(f16 x) {\n return (uint8_t) (float) x;\n}\n\nstatic inline uint16_t fptoui_f16_i16(f16 x) {\n return (uint16_t) x;\n}\n\nstatic inline uint32_t fptoui_f16_i32(f16 x) {\n return (uint32_t) x;\n}\n\nstatic inline uint64_t fptoui_f16_i64(f16 x) {\n return (uint64_t) x;\n}\n\nstatic inline bool ftob_f16_bool(f16 x) {\n return x != (f16)0;\n}\n\nstatic inline f16 btof_bool_f16(bool x) {\n return x ? 1 : 0;\n}\n\n#ifndef EMULATE_F16\nstatic inline bool futrts_isnan16(f16 x) {\n return isnan((float)x);\n}\n\n#ifdef __OPENCL_VERSION__\n\nstatic inline f16 fabs16(f16 x) {\n return fabs(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return fmax(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return fmin(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return pow(x, y);\n}\n\n#elif ISPC\nstatic inline f16 fabs16(f16 x) {\n return abs(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return pow(x, y);\n}\n#else // Assuming CUDA.\n\nstatic inline f16 fabs16(f16 x) {\n return fabsf(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return fmaxf(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return fminf(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return powf(x, y);\n}\n#endif\n\n#if ISPC\nstatic inline bool futrts_isinf16(float x) {\n return !futrts_isnan16(x) && futrts_isnan16(x - x);\n}\nstatic inline bool futrts_isfinite16(float x) {\n return !futrts_isnan16(x) && !futrts_isinf16(x);\n}\n\n#else\n\nstatic inline bool futrts_isinf16(f16 x) {\n retu",
"rn isinf((float)x);\n}\n#endif\n\n#ifdef __OPENCL_VERSION__\nstatic inline f16 futrts_log16(f16 x) {\n return log(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n return log2(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return log10(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n return log1p(x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n return sqrt(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n return cbrt(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return exp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return cos(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return sin(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return tan(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return acos(x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n return asin(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return atan(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return cosh(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return sinh(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return tanh(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n return acosh(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n return asinh(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n return atanh(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n return atan2(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return hypot(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n return tgamma(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n return lgamma(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n return erf(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n return erfc(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return fmod(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return rint(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return floor(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return ceil(x);\n}\n\nstatic inline 
f16 futrts_nextafter16(f16 x, f16 y) {\n return nextafter(x, y);\n}\n\nstatic inline f16 futrts_", "lerp16(f16 v0, f16 v1, f16 t) {\n return mix(v0, v1, t);\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return mad(a, b, c);\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n return fma(a, b, c);\n}\n#elif ISPC\n\nstatic inline f16 futrts_log16(f16 x) {\n return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x;\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n return futrts_log16(x) / log(2.0f16);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return futrts_log16(x) / log(10.0f16);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16;\n f16 y = 1.0f16 + x;\n f16 z = y - 1.0f16;\n return log(y) - (z-x)/y;\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n return (float16)sqrt((float)x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return exp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return (float16)cos((float)x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return (float16)sin((float)x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return (float16)tan((float)x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return (float16)acos((float)x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n return (float16)asin((float)x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return (float16)atan((float)x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return (exp(x)+exp(-x)) / 2.0f16;\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return (exp(x)-exp(-x)) / 2.0f16;\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return futrts_sinh16(x)/futrts_cosh16(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n float16 f = x+(float16)sqrt((float)(x*x-1));\n if(futrts_isfinite16(f)) return log(f);\n return f;\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n float16 f = x+(float16)sqrt((float)(x*x+1));\n if(futrts_isfinite16(f)) return log(f);\n 
return f;\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n float16 f = (1+x)/(1-x);\n if(futrts_isfinite16(f)) return log(f)/2.0f16;\n return f;\n}\n\nstatic inline", " f16 futrts_atan2_16(f16 x, f16 y) {\n return (float16)atan2((float)x, (float)y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return (float16)futrts_hypot32((float)x, (float)y);\n}\n\nextern \"C\" unmasked uniform float tgammaf(uniform float x);\nstatic inline f16 futrts_gamma16(f16 x) {\n f16 res;\n foreach_active (i) {\n uniform f16 r = (f16)tgammaf(extract((float)x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float lgammaf(uniform float x);\nstatic inline f16 futrts_lgamma16(f16 x) {\n f16 res;\n foreach_active (i) {\n uniform f16 r = (f16)lgammaf(extract((float)x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n f16 res = (f16)futrts_cbrt32((float)x);\n return res;\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n f16 res = (f16)futrts_erf32((float)x);\n return res;\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n f16 res = (f16)futrts_erfc32((float)x);\n return res;\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return x - y * (float16)trunc((float) (x/y));\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return (float16)round((float)x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return (float16)floor((float)x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return (float16)ceil((float)x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n return (float16)futrts_nextafter32((float)x, (float) y);\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return a * b + c;\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n return a * b + c;\n}\n\n#else // Assume CUDA.\n\nstatic inline f16 futrts_log16(f16 x) {\n return hlog(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 
x) {\n return hlog2(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return hlog10(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n return (f16)log1pf((float)x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n ret",
"urn hsqrt(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n return cbrtf(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return hexp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return hcos(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return hsin(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return tanf(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return acosf(x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n return asinf(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return atanf(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return coshf(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return sinhf(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return tanhf(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n return acoshf(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n return asinhf(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n return atanhf(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n return atan2f(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return hypotf(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n return tgammaf(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n return lgammaf(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n return erff(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n return erfcf(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return fmodf(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return rintf(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return hfloor(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return hceil(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y)));\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return a * b + c;\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 
b, f16 c) {\n return fmaf(a, b, c);\n}\n\n#endif\n\n// The CUDA __half type cannot be put ", "in unions for some reason, so we\n// use bespoke conversion functions instead.\n#ifdef __CUDA_ARCH__\nstatic inline int16_t futrts_to_bits16(f16 x) {\n return __half_as_ushort(x);\n}\nstatic inline f16 futrts_from_bits16(int16_t x) {\n return __ushort_as_half(x);\n}\n#elif ISPC\n\nstatic inline int16_t futrts_to_bits16(f16 x) {\n varying int16_t y = *((varying int16_t * uniform)&x);\n return y;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n varying f16 y = *((varying f16 * uniform)&x);\n return y;\n}\n#else\nstatic inline int16_t futrts_to_bits16(f16 x) {\n union {\n f16 f;\n int16_t t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n union {\n int16_t f;\n f16 t;\n } p;\n\n p.f = x;\n return p.t;\n}\n#endif\n\n#else // No native f16 - emulate.\n\nstatic inline f16 fabs16(f16 x) {\n return fabs32(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return fmax32(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return fmin32(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return fpow32(x, y);\n}\n\nstatic inline bool futrts_isnan16(f16 x) {\n return futrts_isnan32(x);\n}\n\nstatic inline bool futrts_isinf16(f16 x) {\n return futrts_isinf32(x);\n}\n\nstatic inline f16 futrts_log16(f16 x) {\n return futrts_log32(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n return futrts_log2_32(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return futrts_log10_32(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n return futrts_log1p_32(x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n return futrts_sqrt32(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n return futrts_cbrt32(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return futrts_exp32(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return futrts_cos32(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return 
futrts_sin32(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return futrts_tan32(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return futrts_acos32(x);\n}\n\nstatic inline f16 f", "utrts_asin16(f16 x) {\n return futrts_asin32(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return futrts_atan32(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return futrts_cosh32(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return futrts_sinh32(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return futrts_tanh32(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n return futrts_acosh32(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n return futrts_asinh32(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n return futrts_atanh32(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n return futrts_atan2_32(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return futrts_hypot32(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n return futrts_gamma32(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n return futrts_lgamma32(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n return futrts_erf32(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n return futrts_erfc32(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return fmod32(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return futrts_round32(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return futrts_floor32(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return futrts_ceil32(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n return halfbits2float(halfbitsnextafter(float2halfbits(x), float2halfbits(y)));\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n return futrts_lerp32(v0, v1, t);\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return futrts_mad32(a, b, c);\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n return futrts_fma32(a, b, c);\n}\n\n// Even when we are using an OpenCL 
that does not support cl_khr_fp16,\n// it must still support vload_half for actually creating a\n// half-precision number, which can then be efficiently converted to a\n// float. Similarly for vstore_half.\n#ifdef __OPENCL_VERSION__\n\nstatic inline int16_t futrt",
"s_to_bits16(f16 x) {\n int16_t y;\n // Violating strict aliasing here.\n vstore_half((float)x, 0, (half*)&y);\n return y;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n return (f16)vload_half(0, (half*)&x);\n}\n\n#else\n\nstatic inline int16_t futrts_to_bits16(f16 x) {\n return (int16_t)float2halfbits(x);\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n return halfbits2float((uint16_t)x);\n}\n\nstatic inline f16 fsignum16(f16 x) {\n return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);\n}\n\n#endif\n\n#endif\n\nstatic inline float fpconv_f16_f16(f16 x) {\n return x;\n}\n\nstatic inline float fpconv_f16_f32(f16 x) {\n return x;\n}\n\nstatic inline f16 fpconv_f32_f16(float x) {\n return (f16) x;\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\nstatic inline double fpconv_f16_f64(f16 x) {\n return (double) x;\n}\n\n#if ISPC\nstatic inline f16 fpconv_f64_f16(double x) {\n return (f16) ((float)x);\n}\n#else\nstatic inline f16 fpconv_f64_f16(double x) {\n return (f16) x;\n}\n#endif\n#endif\n\n\n// End of scalar_f16.h.\n// Start of atomics.h\n\ninline int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xchg_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,\n int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((int32_t*)p, cmp, val);\n#else\n return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_local(volatile __local int32_t *p,\n int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((int32_t*)p, cmp, val);\n#else\n return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n re", 
"turn atomicAdd((int32_t*)p, x);\n#else\n return atomic_add(p, x);\n#endif\n}\n\ninline int32_t atomic_add_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((int32_t*)p, x);\n#else\n return atomic_add(p, x);\n#endif\n}\n\ninline float atomic_fadd_f32_global(volatile __global float *p, float x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((float*)p, x);\n#else\n union { int32_t i; float f; } old;\n union { int32_t i; float f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline float atomic_fadd_f32_local(volatile __local float *p, float x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((float*)p, x);\n#else\n union { int32_t i; float f; } old;\n union { int32_t i; float f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i32_local((volatile __local int32_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smax_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint32_t*)p, 
x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\n", "inline uint32_t atomic_umax_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int32_t*)p, x);\n#else\n return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int32_t*)p, x);\n#else\n return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int32_t*)p, x);\n#else\n return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int32_t*)p, x);\n#else\n return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\n// Start of 64 bit atomics\n\ninline int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((uint64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xchg_i64_local(volatile __local int64_t 
*p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((uint64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninli",
"ne int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,\n int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((uint64_t*)p, cmp, val);\n#else\n return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_local(volatile __local int64_t *p,\n int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((uint64_t*)p, cmp, val);\n#else\n return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((uint64_t*)p, x);\n#else\n return atom_add(p, x);\n#endif\n}\n\ninline int64_t atomic_add_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((uint64_t*)p, x);\n#else\n return atom_add(p, x);\n#endif\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\ninline double atomic_fadd_f64_global(volatile __global double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n return atomicAdd((double*)p, x);\n#else\n union { int64_t i; double f; } old;\n union { int64_t i; double f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline double atomic_fadd_f64_local(volatile __local double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n return atomicAdd((double*)p, x);\n#else\n union { int64_t i; double f; } old;\n union { int64_t i; double f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i64_local((volatile __local int64_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\n#endif\n\ninline int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline ", "int64_t 
atomic_smax_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int64_t*)p, x);\n#else\n return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int64_t*)p, x);\n#else\n return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int64_t*)p, x);\n#else\n return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int64_t*)p, x);\n#else\n return atom_or(p, 
x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_global(volatile __global ", "int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\n// End of atomics.h\n\n\n\n__attribute__((reqd_work_group_size(addzisegmap_group_sizze_6879, 1, 1)))\n__kernel void addzisegmap_6892(__global int *global_failure, int64_t n_6776, __global unsigned char *xs_mem_6916, __global unsigned char *ys_mem_6917, __global unsigned char *mem_6920)\n{\n #define segmap_group_sizze_6888 (addzisegmap_group_sizze_6879)\n \n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n if (*global_failure >= 0)\n return;\n \n int32_t local_tid_6925;\n int64_t group_sizze_6928;\n int32_t wave_sizze_6927;\n int32_t group_tid_6926;\n \n local_tid_6925 = get_local_id(0);\n group_sizze_6928 = get_local_size(0);\n wave_sizze_6927 = LOCKSTEP_WIDTH;\n group_tid_6926 = get_group_id(0);\n \n int32_t global_tid_6924 = group_tid_6926 * group_sizze_6928 + local_tid_6925;\n int32_t phys_tid_6892 = global_tid_6924;\n int64_t global_tid_6929 = sext_i32_i64(group_tid_6926) * segmap_group_sizze_6888 + sext_i32_i64(local_tid_6925);\n int64_t slice_6930 = n_6776;\n int64_t gtid_6891 = global_tid_6929;\n int64_t remnant_6931 = global_tid_6929 - gtid_6891;\n \n if (slt64(gtid_6891, n_6776)) {\n int8_t x_6893 = ((__global int8_t *) xs_mem_6916)[gtid_6891];\n int8_t x_6894 = ((__global int8_t *) ys_mem_6917)[gtid_6891];\n int8_t defunc_0_f_res_6895 = add8(x_6893, x_6894);\n \n ((__global int8_t *) mem_6920)[gtid_6891] = defunc_0_f_res_6895;\n }\n \n error_0:\n return;\n #undef segmap_group_sizze_6888\n}\n__attribute__((reqd_work_group_size(add_i64zisegmap_group_sizze_6899, 1, 1)))\n__kernel void add_i64zisegmap_6912(__global int *global_failure, int64_t 
n_", "6836, __global unsigned char *xs_mem_6916, __global unsigned char *ys_mem_6917, __global unsigned char *mem_6921)\n{\n #define segmap_group_sizze_6908 (add_i64zisegmap_group_sizze_6899)\n \n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n if (*global_failure >= 0)\n return;\n \n int32_t local_tid_6925;\n int64_t group_sizze_6928;\n int32_t wave_sizze_6927;\n int32_t group_tid_6926;\n \n local_tid_6925 = get_local_id(0);\n group_sizze_6928 = get_local_size(0);\n wave_sizze_6927 = LOCKSTEP_WIDTH;\n group_tid_6926 = get_group_id(0);\n \n int32_t global_tid_6924 = group_tid_6926 * group_sizze_6928 + local_tid_6925;\n int32_t phys_tid_6912 = global_tid_6924;\n int64_t global_tid_6929 = sext_i32_i64(group_tid_6926) * segmap_group_sizze_6908 + sext_i32_i64(local_tid_6925);\n int64_t slice_6930 = n_6836;\n int64_t gtid_6911 = global_tid_6929;\n int64_t remnant_6931 = global_tid_6929 - gtid_6911;\n \n if (slt64(gtid_6911, n_6836)) {\n int64_t x_6913 = ((__global int64_t *) xs_mem_6916)[gtid_6911];\n int64_t x_6914 = ((__global int64_t *) ys_mem_6917)[gtid_6911];\n int64_t defunc_0_f_res_6915 = add64(x_6913, x_6914);\n \n ((__global int64_t *) mem_6921)[gtid_6911] = defunc_0_f_res_6915;\n }\n \n error_0:\n return;\n #undef segmap_group_sizze_6908\n}\n", NULL};
// Start of backends/opencl.h
// Forward declarations.
struct opencl_device_option;
// Invoked by setup_opencl() after the platform and device have been
// found, but before the program is loaded. Its intended use is to
// tune constants based on the selected platform and device.
static void post_opencl_setup(struct futhark_context*, struct opencl_device_option*);
// Defined elsewhere in the generated program.
static void set_tuning_params(struct futhark_context* ctx);
// Produces the error message for a failure index read back from the
// device; called by futhark_context_sync().
static char* get_failure_msg(int failure_idx, int64_t args[]);
// Evaluate the OpenCL status expression 'e' and hand it to
// opencl_succeed_fatal() together with the expression text and the
// source location of the call.
#define OPENCL_SUCCEED_FATAL(e) opencl_succeed_fatal(e, #e, __FILE__, __LINE__)
// As OPENCL_SUCCEED_FATAL, but via opencl_succeed_nonfatal(): yields
// NULL on success and a heap-allocated error message on failure.
#define OPENCL_SUCCEED_NONFATAL(e) opencl_succeed_nonfatal(e, #e, __FILE__, __LINE__)
// Evaluate the OpenCL status expression 'e'.  If it failed and no
// error is recorded on the context yet, store the message in
// ctx->error and return 'bad' from the enclosing function.  Requires
// a variable 'ctx' and a value 'bad' to be in scope at the use site.
//
// Take care not to override an existing error: if ctx->error is
// already set, the new message is freed and execution continues
// after the macro WITHOUT returning.
#define OPENCL_SUCCEED_OR_RETURN(e) { \
char *serror = OPENCL_SUCCEED_NONFATAL(e); \
if (serror) { \
if (!ctx->error) { \
ctx->error = serror; \
return bad; \
} else { \
free(serror); \
} \
} \
}
// OPENCL_SUCCEED_OR_RETURN returns the value of the variable 'bad' in
// scope. By default, it will be this one. Create a local variable
// of some other type if needed. This is a bit of a hack, but it
// saves effort in the code generator.
static const int bad = 1;
// Translate an OpenCL status code into a short human-readable
// description.  Codes that have no entry in the table yield
// "Unknown".  The result is a string literal; do not free it.
static const char* opencl_error_string(cl_int err) {
  static const struct {
    cl_int code;
    const char *text;
  } descriptions[] = {
    { CL_SUCCESS, "Success!" },
    { CL_DEVICE_NOT_FOUND, "Device not found." },
    { CL_DEVICE_NOT_AVAILABLE, "Device not available" },
    { CL_COMPILER_NOT_AVAILABLE, "Compiler not available" },
    { CL_MEM_OBJECT_ALLOCATION_FAILURE, "Memory object allocation failure" },
    { CL_OUT_OF_RESOURCES, "Out of resources" },
    { CL_OUT_OF_HOST_MEMORY, "Out of host memory" },
    { CL_PROFILING_INFO_NOT_AVAILABLE, "Profiling information not available" },
    { CL_MEM_COPY_OVERLAP, "Memory copy overlap" },
    { CL_IMAGE_FORMAT_MISMATCH, "Image format mismatch" },
    { CL_IMAGE_FORMAT_NOT_SUPPORTED, "Image format not supported" },
    { CL_BUILD_PROGRAM_FAILURE, "Program build failure" },
    { CL_MAP_FAILURE, "Map failure" },
    { CL_INVALID_VALUE, "Invalid value" },
    { CL_INVALID_DEVICE_TYPE, "Invalid device type" },
    { CL_INVALID_PLATFORM, "Invalid platform" },
    { CL_INVALID_DEVICE, "Invalid device" },
    { CL_INVALID_CONTEXT, "Invalid context" },
    { CL_INVALID_QUEUE_PROPERTIES, "Invalid queue properties" },
    { CL_INVALID_COMMAND_QUEUE, "Invalid command queue" },
    { CL_INVALID_HOST_PTR, "Invalid host pointer" },
    { CL_INVALID_MEM_OBJECT, "Invalid memory object" },
    { CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "Invalid image format descriptor" },
    { CL_INVALID_IMAGE_SIZE, "Invalid image size" },
    { CL_INVALID_SAMPLER, "Invalid sampler" },
    { CL_INVALID_BINARY, "Invalid binary" },
    { CL_INVALID_BUILD_OPTIONS, "Invalid build options" },
    { CL_INVALID_PROGRAM, "Invalid program" },
    { CL_INVALID_PROGRAM_EXECUTABLE, "Invalid program executable" },
    { CL_INVALID_KERNEL_NAME, "Invalid kernel name" },
    { CL_INVALID_KERNEL_DEFINITION, "Invalid kernel definition" },
    { CL_INVALID_KERNEL, "Invalid kernel" },
    { CL_INVALID_ARG_INDEX, "Invalid argument index" },
    { CL_INVALID_ARG_VALUE, "Invalid argument value" },
    { CL_INVALID_ARG_SIZE, "Invalid argument size" },
    { CL_INVALID_KERNEL_ARGS, "Invalid kernel arguments" },
    { CL_INVALID_WORK_DIMENSION, "Invalid work dimension" },
    { CL_INVALID_WORK_GROUP_SIZE, "Invalid work group size" },
    { CL_INVALID_WORK_ITEM_SIZE, "Invalid work item size" },
    { CL_INVALID_GLOBAL_OFFSET, "Invalid global offset" },
    { CL_INVALID_EVENT_WAIT_LIST, "Invalid event wait list" },
    { CL_INVALID_EVENT, "Invalid event" },
    { CL_INVALID_OPERATION, "Invalid operation" },
    { CL_INVALID_GL_OBJECT, "Invalid OpenGL object" },
    { CL_INVALID_BUFFER_SIZE, "Invalid buffer size" },
    { CL_INVALID_MIP_LEVEL, "Invalid mip-map level" },
  };
  const size_t count = sizeof(descriptions) / sizeof(descriptions[0]);
  for (size_t k = 0; k < count; k++) {
    if (descriptions[k].code == err) {
      return descriptions[k].text;
    }
  }
  return "Unknown";
}
// Backend of OPENCL_SUCCEED_FATAL.  Does nothing when 'ret' is
// CL_SUCCESS; otherwise reports the failed call (its source text,
// file and line, plus the numeric and textual error) through
// futhark_panic().
static void opencl_succeed_fatal(cl_int ret,
                                 const char *call,
                                 const char *file,
                                 int line) {
  if (ret == CL_SUCCESS) {
    return;
  }
  const char *description = opencl_error_string(ret);
  futhark_panic(-1, "%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n",
                file, line, call, ret, description);
}
// Backend of OPENCL_SUCCEED_NONFATAL.  Returns NULL when 'ret' is
// CL_SUCCESS; otherwise returns a message built with msgprintf()
// describing the failed call.  Callers in this file release the
// message with free().
static char* opencl_succeed_nonfatal(cl_int ret,
                                     const char *call,
                                     const char *file,
                                     int line) {
  if (ret == CL_SUCCESS) {
    return NULL;
  }
  const char *description = opencl_error_string(ret);
  return msgprintf("%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n",
                   file, line, call, ret, description);
}
// User-adjustable configuration from which a futhark_context is
// created.  The fields below "Uniform fields above" are specific to
// this backend and receive their defaults in
// backend_context_config_setup().
struct futhark_context_config {
int in_use;
// When set, opencl_alloc() logs its allocation decisions.
int debugging;
int profiling;
// When set, context setup reports its progress on stderr.
int logging;
const char *cache_fname;
// Number of entries in each of the four tuning_param* arrays.
int num_tuning_params;
// Current value of each tuning parameter; 0 means "use the default
// for this parameter's class" (resolved during context setup).
int64_t *tuning_params;
// Names matched by futhark_context_config_set_tuning_param().
const char** tuning_param_names;
// Identifiers emitted as -D<var>=<value> by mk_compile_opts().
const char** tuning_param_vars;
// Class of each parameter; prefix-matched during context setup to
// pick the applicable maximum and default.
const char** tuning_param_classes;
// Uniform fields above.
// Index of the device to pick among those that match the
// platform/device preferences.
int preferred_device_num;
// Substrings that the platform and device names must contain; ""
// matches anything.  The strings are not copied.
const char *preferred_platform;
const char *preferred_device;
// Set by every explicit device/platform selection.
int ignore_blacklist;
// Optional file paths (NULL when unused); see the corresponding
// futhark_context_config_* setters.
const char* dump_program_to;
const char* load_program_from;
const char* dump_binary_to;
const char* load_binary_from;
// Defaults for the tuning parameter classes.  0 is a dummy value
// meaning the concrete default is chosen during initialisation.
size_t default_group_size;
size_t default_num_groups;
size_t default_tile_size;
size_t default_reg_tile_size;
size_t default_threshold;
// Set when the user changed the default explicitly; context setup
// then prints a note if the device forces the value down.
int default_group_size_changed;
int default_tile_size_changed;
// build_opts holds num_build_opts entries followed by a NULL
// terminator.  The array is owned by the config; the strings are not.
int num_build_opts;
const char **build_opts;
// User-supplied command queue; only meaningful when queue_set != 0.
cl_command_queue queue;
int queue_set;
};
// Give the OpenCL-specific part of a configuration its default
// values.  cfg->queue is left untouched; queue_set = 0 marks it as
// absent.
static void backend_context_config_setup(struct futhark_context_config* cfg) {
  // Extra build options: an empty, NULL-terminated list.
  const char **opts = (const char**) malloc(sizeof(const char*));
  opts[0] = NULL;
  cfg->build_opts = opts;
  cfg->num_build_opts = 0;

  // Device selection: no preference, first matching device.
  cfg->preferred_platform = "";
  cfg->preferred_device = "";
  cfg->preferred_device_num = 0;
  cfg->ignore_blacklist = 0;
  cfg->queue_set = 0;

  // No program/binary dumping or loading requested.
  cfg->dump_program_to = NULL;
  cfg->load_program_from = NULL;
  cfg->dump_binary_to = NULL;
  cfg->load_binary_from = NULL;

  // The following are dummy sizes that mean the concrete defaults
  // will be set during initialisation via hardware-inspection-based
  // heuristics.
  cfg->default_group_size = 0;
  cfg->default_group_size_changed = 0;
  cfg->default_num_groups = 0;
  cfg->default_tile_size = 0;
  cfg->default_tile_size_changed = 0;
  cfg->default_reg_tile_size = 0;
  cfg->default_threshold = 0;
}
// Release the backend-specific resources of a configuration: the
// build_opts array allocated by backend_context_config_setup().  The
// option strings themselves are not freed.
static void backend_context_config_teardown(struct futhark_context_config* cfg) {
free(cfg->build_opts);
}
// Append 'opt' to the NULL-terminated list of extra build options.
// Only the pointer is stored, so the string must outlive the
// configuration's use.
void futhark_context_config_add_build_option(struct futhark_context_config* cfg, const char *opt) {
  const int slot = cfg->num_build_opts;
  // The slot that held the terminator receives the new option...
  cfg->build_opts[slot] = opt;
  // ...and the array grows by one element to hold a fresh terminator.
  cfg->build_opts = (const char**) realloc(cfg->build_opts, (slot + 2) * sizeof(const char*));
  cfg->build_opts[slot + 1] = NULL;
  cfg->num_build_opts = slot + 1;
}
// Select the preferred device from a specification string 's'.
//
// If 's' starts with '#', the decimal digits that follow give the
// index of the device to pick among the matching ones; whitespace
// after the number is skipped.  Whatever remains of 's' (possibly the
// empty string) is stored as the device-name substring to match.
// Without a '#' prefix the index is 0 and all of 's' is the name.
//
// Only a pointer into 's' is stored, so the string must outlive the
// configuration's use.  Also marks the blacklist as ignored.
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char* s) {
  int x = 0;
  if (*s == '#') {
    s++;
    // The <ctype.h> classifiers require a value representable as
    // unsigned char (or EOF); a plain char may be negative.
    while (isdigit((unsigned char) *s)) {
      x = x * 10 + (*s++)-'0';
    }
    // Skip trailing spaces.
    while (isspace((unsigned char) *s)) {
      s++;
    }
  }
  cfg->preferred_device = s;
  cfg->preferred_device_num = x;
  cfg->ignore_blacklist = 1;
}
// Require the chosen platform's name to contain 's'.  Only the
// pointer is stored.  An explicit choice also disables the device
// blacklist.
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s) {
  cfg->ignore_blacklist = 1;
  cfg->preferred_platform = s;
}
// Supply an existing OpenCL command queue 'q' and flag it as present
// via queue_set.
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue q) {
  cfg->queue_set = 1;
  cfg->queue = q;
}
// One selectable OpenCL device together with the platform it belongs
// to.  As filled in by opencl_all_device_options(), platform_name and
// device_name are separately heap-allocated strings that the holder
// must free().
struct opencl_device_option {
cl_platform_id platform;
cl_device_id device;
// Value of CL_DEVICE_TYPE for 'device'.
cl_device_type device_type;
char *platform_name;
char *device_name;
};
// Fetch the platform property 'param' into a freshly malloc()ed
// buffer: one query for the required size, a second for the data.
// Any OpenCL failure is fatal.  The caller owns the returned buffer.
static char* opencl_platform_info(cl_platform_id platform,
                                  cl_platform_info param) {
  size_t req_bytes;
  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, 0, NULL, &req_bytes));

  char *info = (char*) malloc(req_bytes);
  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, req_bytes, info, NULL));
  return info;
}
// Fetch the device property 'param' into a freshly malloc()ed
// buffer: one query for the required size, a second for the data.
// Any OpenCL failure is fatal.  The caller owns the returned buffer.
static char* opencl_device_info(cl_device_id device,
                                cl_device_info param) {
  size_t req_bytes;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, 0, NULL, &req_bytes));

  char *info = (char*) malloc(req_bytes);
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, req_bytes, info, NULL));
  return info;
}
// Decide whether a device should be skipped by default.  Returns 1
// only when the user expressed no platform/device preference and the
// platform name contains "Apple" while the device name contains
// "Intel(R) Core(TM)"; returns 0 otherwise.
static int is_blacklisted(const char *platform_name, const char *device_name,
                          const struct futhark_context_config *cfg) {
  // Any explicit preference overrides the blacklist.
  const int has_preference =
    cfg->preferred_platform[0] != '\0' || cfg->preferred_device[0] != '\0';
  if (has_preference) {
    return 0;
  }
  return strstr(platform_name, "Apple") != NULL &&
         strstr(device_name, "Intel(R) Core(TM)") != NULL;
}
static void opencl_all_device_options(struct opencl_device_option **devices_out,
size_t *num_devices_out) {
size_t num_devices = 0, num_devices_added = 0;
cl_platform_id *all_platforms;
cl_uint *platform_num_devices;
cl_uint num_platforms;
// Find the number of platforms.
OPENCL_SUCCEED_FATAL(clGetPlatformIDs(0, NULL, &num_platforms));
// Make room for them.
all_platforms = calloc(num_platforms, sizeof(cl_platform_id));
platform_num_devices = calloc(num_platforms, sizeof(cl_uint));
// Fetch all the platforms.
OPENCL_SUCCEED_FATAL(clGetPlatformIDs(num_platforms, all_platforms, NULL));
// Count the number of devices for each platform, as well as the
// total number of devices.
for (cl_uint i = 0; i < num_platforms; i++) {
if (clGetDeviceIDs(all_platforms[i], CL_DEVICE_TYPE_ALL,
0, NULL, &platform_num_devices[i]) == CL_SUCCESS) {
num_devices += platform_num_devices[i];
} else {
platform_num_devices[i] = 0;
}
}
// Make room for all the device options.
struct opencl_device_option *devices =
calloc(num_devices, sizeof(struct opencl_device_option));
// Loop through the platforms, getting information about their devices.
for (cl_uint i = 0; i < num_platforms; i++) {
cl_platform_id platform = all_platforms[i];
cl_uint num_platform_devices = platform_num_devices[i];
if (num_platform_devices == 0) {
continue;
}
char *platform_name = opencl_platform_info(platform, CL_PLATFORM_NAME);
cl_device_id *platform_devices =
calloc(num_platform_devices, sizeof(cl_device_id));
// Fetch all the devices.
OPENCL_SUCCEED_FATAL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL,
num_platform_devices, platform_devices, NULL));
// Loop through the devices, adding them to the devices array.
for (cl_uint i = 0; i < num_platform_devices; i++) {
char *device_name = opencl_device_info(platform_devices[i], CL_DEVICE_NAME);
devices[num_devices_added].platform = platform;
devices[num_devices_added].device = platform_devices[i];
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(platform_devices[i], CL_DEVICE_TYPE,
sizeof(cl_device_type),
&devices[num_devices_added].device_type,
NULL));
// We don't want the structs to share memory, so copy the platform name.
// Each device name is already unique.
devices[num_devices_added].platform_name = strclone(platform_name);
devices[num_devices_added].device_name = device_name;
num_devices_added++;
}
free(platform_devices);
free(platform_name);
}
free(all_platforms);
free(platform_num_devices);
*devices_out = devices;
*num_devices_out = num_devices;
}
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg) {
struct opencl_device_option *devices;
size_t num_devices;
opencl_all_device_options(&devices, &num_devices);
printf("Choose OpenCL device:\n");
const char *cur_platform = "";
for (size_t i = 0; i < num_devices; i++) {
struct opencl_device_option device = devices[i];
if (strcmp(cur_platform, device.platform_name) != 0) {
printf("Platform: %s\n", device.platform_name);
cur_platform = device.platform_name;
}
printf("[%d] %s\n", (int)i, device.device_name);
}
int selection;
printf("Choice: ");
if (scanf("%d", &selection) == 1) {
cfg->preferred_platform = "";
cfg->preferred_device = "";
cfg->preferred_device_num = selection;
cfg->ignore_blacklist = 1;
}
// Free all the platform and device names.
for (size_t j = 0; j < num_devices; j++) {
free(devices[j].platform_name);
free(devices[j].device_name);
}
free(devices);
}
void futhark_context_config_list_devices(struct futhark_context_config *cfg) {
(void)cfg;
struct opencl_device_option *devices;
size_t num_devices;
opencl_all_device_options(&devices, &num_devices);
const char *cur_platform = "";
for (size_t i = 0; i < num_devices; i++) {
struct opencl_device_option device = devices[i];
if (strcmp(cur_platform, device.platform_name) != 0) {
printf("Platform: %s\n", device.platform_name);
cur_platform = device.platform_name;
}
printf("[%d]: %s\n", (int)i, device.device_name);
}
// Free all the platform and device names.
for (size_t j = 0; j < num_devices; j++) {
free(devices[j].platform_name);
free(devices[j].device_name);
}
free(devices);
}
// Ask context setup to write the OpenCL program source to 'path'.
// Only the pointer is stored; the string is not copied.
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *path) {
cfg->dump_program_to = path;
}
// Ask context setup to read the OpenCL program source from 'path'
// instead of using the embedded source.  Only the pointer is stored.
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *path) {
cfg->load_program_from = path;
}
// Ask context setup to write the built program binary to 'path'.
// Only the pointer is stored.
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *path) {
cfg->dump_binary_to = path;
}
// Ask context setup to create the program from the binary stored in
// 'path' rather than from source.  Only the pointer is stored.
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *path) {
cfg->load_binary_from = path;
}
// Set the default group size and remember that the user chose it, so
// that context setup prints a note if the device forces it down.
// NOTE(review): 'size' is an int stored into a size_t field; a
// negative argument would wrap to a very large value.
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size) {
cfg->default_group_size = size;
cfg->default_group_size_changed = 1;
}
// Set the default number of groups.
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int num) {
cfg->default_num_groups = num;
}
// Set the default tile size and remember that the user chose it, so
// that context setup prints a note if the device forces it down.
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size) {
cfg->default_tile_size = size;
cfg->default_tile_size_changed = 1;
}
// Set the default register tile size.
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size) {
cfg->default_reg_tile_size = size;
}
// Set the default threshold.
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size) {
cfg->default_threshold = size;
}
// Set the tuning parameter called 'param_name' to 'new_value'.
//
// Program-specific tuning parameters are searched first; after that
// the built-in "default_*" names are tried.  Returns 0 if a parameter
// with that name was found and updated, 1 otherwise.  Unlike the
// dedicated setters, the *_changed flags are not touched here.
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg,
                                            const char *param_name,
                                            size_t new_value) {
  for (int i = 0; i < cfg->num_tuning_params; i++) {
    if (strcmp(param_name, cfg->tuning_param_names[i]) == 0) {
      cfg->tuning_params[i] = new_value;
      return 0;
    }
  }

  // Built-in defaults, checked in this order.
  const struct {
    const char *name;
    size_t *field;
  } builtins[] = {
    { "default_group_size", &cfg->default_group_size },
    { "default_num_groups", &cfg->default_num_groups },
    { "default_threshold", &cfg->default_threshold },
    { "default_tile_size", &cfg->default_tile_size },
    { "default_reg_tile_size", &cfg->default_reg_tile_size },
  };
  for (size_t k = 0; k < sizeof(builtins) / sizeof(builtins[0]); k++) {
    if (strcmp(param_name, builtins[k].name) == 0) {
      *builtins[k].field = new_value;
      return 0;
    }
  }
  return 1;
}
// A record of something that happened: an OpenCL event plus the
// counters to update once its timing is known.
struct profiling_record {
// Heap-allocated by opencl_get_event(); released and freed by
// opencl_tally_profiling_records().
cl_event *event;
// Incremented by one when the record is tallied.
int *runs;
// Incremented by the event's duration in microseconds when tallied.
int64_t *runtime;
};
// Run-time state of a Futhark program instance on the OpenCL backend.
struct futhark_context {
// The configuration this context was created from.
struct futhark_context_config* cfg;
int detail_memory;
int debugging;
int profiling;
int profiling_paused;
int logging;
lock_t lock;
// Pending error message, or NULL.  OPENCL_SUCCEED_OR_RETURN only
// stores a new message here when none is pending.
char *error;
lock_t error_lock;
FILE *log;
struct constants *constants;
struct free_list free_list;
int64_t peak_mem_usage_default;
int64_t cur_mem_usage_default;
struct program* program;
// Common fields above.
// Device buffer holding the index of a failure reported by a kernel
// (negative when there is none); read back in futhark_context_sync().
cl_mem global_failure;
// Device buffer holding the arguments of the failure message.
cl_mem global_failure_args;
struct tuning_params tuning_params;
// True if a potentially failing kernel has been enqueued.
cl_int failure_is_an_option;
int total_runs;
long int total_runtime;
int64_t peak_mem_usage_device;
int64_t cur_mem_usage_device;
// OpenCL handles; device, ctx and queue are filled in by
// setup_opencl_with_command_queue(), which also stores the built
// program in clprogram.
cl_device_id device;
cl_context ctx;
cl_command_queue queue;
cl_program clprogram;
// Device buffers available for reuse; see opencl_alloc()/opencl_free().
struct free_list cl_free_list;
// Limits established during setup.  For max_threshold and
// max_num_groups, 0 means "no limit".
size_t max_group_size;
size_t max_num_groups;
size_t max_tile_size;
size_t max_threshold;
size_t max_local_memory;
// Passed to kernels as LOCKSTEP_WIDTH; forced to at least 1 in setup.
size_t lockstep_width;
// Growable array of pending profiling records:
// profiling_records_used entries of profiling_records_capacity are in use.
struct profiling_record *profiling_records;
int profiling_records_capacity;
int profiling_records_used;
};
// Build 'program' for 'device' with the given compiler options and
// return the resulting build status.  A compilation failure is not
// fatal here: the build log is printed to stderr and the status is
// returned to the caller.  Any other OpenCL error is fatal.
static cl_build_status build_opencl_program(cl_program program, cl_device_id device, const char* options) {
  cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);
  // Avoid termination due to CL_BUILD_PROGRAM_FAILURE
  if (!(clBuildProgram_error == CL_SUCCESS ||
        clBuildProgram_error == CL_BUILD_PROGRAM_FAILURE)) {
    OPENCL_SUCCEED_FATAL(clBuildProgram_error);
  }

  cl_build_status build_status;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program,
                                             device,
                                             CL_PROGRAM_BUILD_STATUS,
                                             sizeof(cl_build_status),
                                             &build_status,
                                             NULL));
  if (build_status == CL_SUCCESS) {
    return build_status;
  }

  // The build did not succeed: fetch and show the build log.
  size_t ret_val_size;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));
  char *build_log = (char*) malloc(ret_val_size+1);
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));
  // The spec technically does not say whether the build log is zero-terminated, so let's be careful.
  build_log[ret_val_size] = '\0';
  fprintf(stderr, "Build log:\n%s\n", build_log);
  free(build_log);
  return build_status;
}
// Assemble the option string passed to the OpenCL compiler.
//
// The string consists of -DLOCKSTEP_WIDTH and -Dmax_group_size, one
// -D<var>=<value> per tuning parameter, every entry of the
// NULL-terminated array 'extra_build_opts', and -DEMULATE_F16 when
// the platform is Oclgrind.  Each option is followed by a space.
//
// Returns a malloc()ed string that the caller must free().
static char* mk_compile_opts(struct futhark_context *ctx,
                             const char *extra_build_opts[],
                             struct opencl_device_option device_option) {
  // Slack for the fixed options and the terminating NUL.
  int compile_opts_size = 1024;
  // "-D<var>=<int> ": size by the identifier that is actually printed
  // below (tuning_param_vars), plus room for the decoration and the
  // decimal value.
  for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
    compile_opts_size += strlen(ctx->cfg->tuning_param_vars[i]) + 20;
  }
  // "<opt> ": the option plus its trailing space.  (The original
  // computed strlen(opt + 1), i.e. one byte too few for the text and
  // none for the space.)
  for (int i = 0; extra_build_opts[i] != NULL; i++) {
    compile_opts_size += strlen(extra_build_opts[i]) + 1;
  }
  char *compile_opts = (char*) malloc(compile_opts_size);
  int w = snprintf(compile_opts, compile_opts_size,
                   "-DLOCKSTEP_WIDTH=%d ",
                   (int)ctx->lockstep_width);
  w += snprintf(compile_opts+w, compile_opts_size-w,
                "-D%s=%d ",
                "max_group_size",
                (int)ctx->max_group_size);
  for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
    w += snprintf(compile_opts+w, compile_opts_size-w,
                  "-D%s=%d ",
                  ctx->cfg->tuning_param_vars[i],
                  (int)ctx->cfg->tuning_params[i]);
  }
  for (int i = 0; extra_build_opts[i] != NULL; i++) {
    w += snprintf(compile_opts+w, compile_opts_size-w,
                  "%s ", extra_build_opts[i]);
  }
  // Oclgrind claims to support cl_khr_fp16, but this is not actually
  // the case.
  if (strcmp(device_option.platform_name, "Oclgrind") == 0) {
    w += snprintf(compile_opts+w, compile_opts_size-w, "-DEMULATE_F16 ");
  }
  return compile_opts;
}
// Count up the runtime all the profiling_records that occured during execution.
// Also clears the buffer of profiling_records.
static cl_int opencl_tally_profiling_records(struct futhark_context *ctx) {
cl_int err;
for (int i = 0; i < ctx->profiling_records_used; i++) {
struct profiling_record record = ctx->profiling_records[i];
cl_ulong start_t, end_t;
if ((err = clGetEventProfilingInfo(*record.event,
CL_PROFILING_COMMAND_START,
sizeof(start_t),
&start_t,
NULL)) != CL_SUCCESS) {
return err;
}
if ((err = clGetEventProfilingInfo(*record.event,
CL_PROFILING_COMMAND_END,
sizeof(end_t),
&end_t,
NULL)) != CL_SUCCESS) {
return err;
}
// OpenCL provides nanosecond resolution, but we want
// microseconds.
*record.runs += 1;
*record.runtime += (end_t - start_t)/1000;
if ((err = clReleaseEvent(*record.event)) != CL_SUCCESS) {
return err;
}
free(record.event);
}
ctx->profiling_records_used = 0;
return CL_SUCCESS;
}
// If profiling, produce an event associated with a profiling record.
static cl_event* opencl_get_event(struct futhark_context *ctx, int *runs, int64_t *runtime) {
if (ctx->profiling_records_used == ctx->profiling_records_capacity) {
ctx->profiling_records_capacity *= 2;
ctx->profiling_records =
realloc(ctx->profiling_records,
ctx->profiling_records_capacity *
sizeof(struct profiling_record));
}
cl_event *event = malloc(sizeof(cl_event));
ctx->profiling_records[ctx->profiling_records_used].event = event;
ctx->profiling_records[ctx->profiling_records_used].runs = runs;
ctx->profiling_records[ctx->profiling_records_used].runtime = runtime;
ctx->profiling_records_used++;
return event;
}
// Allocate 'size' bytes from the driver and store the buffer in
// *mem_out.  Returns CL_SUCCESS or an OpenCL error code.
//
// The problem is that OpenCL may perform lazy allocation, so we
// cannot know whether an allocation succeeded until the first time we
// try to use it.  Hence we immediately perform a small write to see
// if the allocation succeeded.  This is slow, but the assumption is
// that this operation will be rare (most things will go through the
// free list).
static int opencl_alloc_actual(struct futhark_context *ctx, size_t size, cl_mem *mem_out) {
  int error;
  *mem_out = clCreateBuffer(ctx->ctx, CL_MEM_READ_WRITE, size, NULL, &error);
  if (error == CL_SUCCESS) {
    // No need to wait for completion after this.  clWaitForEvents()
    // cannot return mem object allocation failures.  This implies
    // that the buffer is faulted onto the device on enqueue.
    // (Observation by Andreas Kloeckner.)
    int probe = 2;
    error = clEnqueueWriteBuffer(ctx->queue, *mem_out,
                                 CL_TRUE,
                                 0, sizeof(probe), &probe,
                                 0, NULL, NULL);
  }
  return error;
}
// Obtain a device buffer of at least 'min_size' bytes (and never
// less than sizeof(int), since opencl_alloc_actual() writes an int
// to it).
//
// The free list is consulted first.  A block found there that is big
// enough is reused; one that is too small is released.  Otherwise a
// new buffer is allocated from the driver, releasing free-list
// entries one at a time for as long as the driver reports
// CL_MEM_OBJECT_ALLOCATION_FAILURE.
//
// On success returns CL_SUCCESS with the buffer in *mem_out and its
// size in *size_out; otherwise returns an OpenCL error code.
// Progress is written to 'log' when debugging is enabled.
static int opencl_alloc(struct futhark_context *ctx, FILE *log,
                        size_t min_size, const char *tag,
                        cl_mem *mem_out, size_t *size_out) {
  (void)tag;
  if (min_size < sizeof(int)) {
    min_size = sizeof(int);
  }
  // Free-list entries are heap cells (see opencl_free()) that each
  // hold one cl_mem handle.
  cl_mem* memptr;
  if (free_list_find(&ctx->cl_free_list, min_size, tag, size_out, (fl_mem*)&memptr) == 0) {
    // Successfully found a free block.  Is it big enough?
    if (*size_out >= min_size) {
      if (ctx->cfg->debugging) {
        fprintf(log, "No need to allocate: Found a block in the free list.\n");
      }
      *mem_out = *memptr;
      free(memptr);
      return CL_SUCCESS;
    } else {
      if (ctx->cfg->debugging) {
        fprintf(log, "Found a free block, but it was too small.\n");
      }
      int error = clReleaseMemObject(*memptr);
      // Free the heap cell itself.  The cl_mem handle inside it does
      // not come from malloc() and has just been released above.
      free(memptr);
      if (error != CL_SUCCESS) {
        return error;
      }
    }
  }
  *size_out = min_size;
  // We have to allocate a new block from the driver.  If the
  // allocation does not succeed, then we might be in an out-of-memory
  // situation.  We now start freeing things from the free list until
  // we think we have freed enough that the allocation will succeed.
  // Since we don't know how far the allocation is from fitting, we
  // have to check after every deallocation.  This might be pretty
  // expensive.  Let's hope that this case is hit rarely.
  if (ctx->cfg->debugging) {
    fprintf(log, "Actually allocating the desired block.\n");
  }
  int error = opencl_alloc_actual(ctx, min_size, mem_out);
  while (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) {
    if (ctx->cfg->debugging) {
      fprintf(log, "Out of OpenCL memory: releasing entry from the free list...\n");
    }
    if (free_list_first(&ctx->cl_free_list, (fl_mem*)&memptr) == 0) {
      cl_mem mem = *memptr;
      free(memptr);
      error = clReleaseMemObject(mem);
      if (error != CL_SUCCESS) {
        return error;
      }
    } else {
      // Nothing left to release; report the allocation failure.
      break;
    }
    error = opencl_alloc_actual(ctx, min_size, mem_out);
  }
  return error;
}
// Hand a device buffer back for later reuse.  The buffer is not
// released to the driver; its handle is boxed in a heap cell and
// inserted into the context's free list under 'size' and 'tag'.
// Always returns CL_SUCCESS.
static int opencl_free(struct futhark_context *ctx,
                       cl_mem mem, size_t size, const char *tag) {
  cl_mem *boxed = malloc(sizeof(cl_mem));
  *boxed = mem;
  free_list_insert(&ctx->cl_free_list, size, (fl_mem)boxed, tag);
  return CL_SUCCESS;
}
// Release every buffer held in the context's free list back to the
// driver.  Returns CL_SUCCESS, or the first error reported by
// clReleaseMemObject() (in which case remaining entries stay in the
// list).
static int opencl_free_all(struct futhark_context *ctx) {
  free_list_pack(&ctx->cl_free_list);
  for (;;) {
    cl_mem* memptr;
    if (free_list_first(&ctx->cl_free_list, (fl_mem*)&memptr) != 0) {
      // The free list is exhausted.
      return CL_SUCCESS;
    }
    // Unbox the handle and discard the cell before releasing.
    cl_mem mem = *memptr;
    free(memptr);
    int error = clReleaseMemObject(mem);
    if (error != CL_SUCCESS) {
      return error;
    }
  }
}
// Wait for all enqueued work to finish and report any failure that a
// kernel signalled in the meantime.
//
// Returns 0 on success.  If an OpenCL call fails, the message is
// stored in ctx->error and 'bad' is returned (see
// OPENCL_SUCCEED_OR_RETURN).  If a kernel reported a failure, the
// device-side failure flag is reset, ctx->error is set to the message
// from get_failure_msg(), and FUTHARK_PROGRAM_ERROR is returned.
int futhark_context_sync(struct futhark_context* ctx) {
  // Check for any delayed error.  The read is non-blocking; the
  // clFinish() below completes it.
  cl_int failure_idx = -1;
  if (ctx->failure_is_an_option) {
    OPENCL_SUCCEED_OR_RETURN(
      clEnqueueReadBuffer(ctx->queue,
                          ctx->global_failure,
                          CL_FALSE,
                          0, sizeof(cl_int), &failure_idx,
                          0, NULL, NULL));
    ctx->failure_is_an_option = 0;
  }
  OPENCL_SUCCEED_OR_RETURN(clFinish(ctx->queue));

  if (failure_idx < 0) {
    // No kernel reported a failure.
    return 0;
  }

  // We have to clear global_failure so that the next entry point
  // is not considered a failure from the start.
  cl_int no_failure = -1;
  OPENCL_SUCCEED_OR_RETURN(
    clEnqueueWriteBuffer(ctx->queue, ctx->global_failure, CL_TRUE,
                         0, sizeof(cl_int), &no_failure,
                         0, NULL, NULL));

  // Fetch the arguments of the failure message.
  int64_t args[max_failure_args+1];
  OPENCL_SUCCEED_OR_RETURN(
    clEnqueueReadBuffer(ctx->queue,
                        ctx->global_failure_args,
                        CL_TRUE,
                        0, sizeof(args), &args,
                        0, NULL, NULL));
  ctx->error = get_failure_msg(failure_idx, args);
  return FUTHARK_PROGRAM_ERROR;
}
// Initialise the OpenCL part of 'ctx' on top of an existing command
// queue: derive context and device from the queue, establish size
// limits and tuning parameter values, and create and build the
// OpenCL program, which is stored in ctx->clprogram.
//
// We take as input several strings representing the program, because
// C does not guarantee that the compiler supports particularly large
// literals.  Notably, Visual C has a limit of 2048 characters.  The
// array must be NULL-terminated.
//
// The program is created, in order of precedence, from the binary
// file cfg->load_binary_from, from a cached binary in 'cache_fname',
// from the source file cfg->load_program_from, or from 'srcs'.
// 'extra_build_opts' is a NULL-terminated array of additional
// compiler options.  'cache_fname' may be NULL to disable caching.
// All OpenCL failures in this function are fatal.
static void setup_opencl_with_command_queue(struct futhark_context *ctx,
                                            cl_command_queue queue,
                                            const char *srcs[],
                                            const char *extra_build_opts[],
                                            const char* cache_fname) {
  int error;
  free_list_init(&ctx->cl_free_list);
  ctx->queue = queue;
  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx->ctx, NULL));
  // Fill out the device info.  This is redundant work if we are
  // called from setup_opencl() (which is the common case), but I
  // doubt it matters much.
  struct opencl_device_option device_option;
  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_DEVICE,
                                             sizeof(cl_device_id),
                                             &device_option.device,
                                             NULL));
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PLATFORM,
                                       sizeof(cl_platform_id),
                                       &device_option.platform,
                                       NULL));
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_TYPE,
                                       sizeof(cl_device_type),
                                       &device_option.device_type,
                                       NULL));
  // NOTE(review): these two strings are heap-allocated and never
  // freed here; whether post_opencl_setup() retains them cannot be
  // seen from this function, so they are left alone.
  device_option.platform_name = opencl_platform_info(device_option.platform, CL_PLATFORM_NAME);
  device_option.device_name = opencl_device_info(device_option.device, CL_DEVICE_NAME);
  ctx->device = device_option.device;
  if (f64_required) {
    cl_uint supported;
    OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                                         sizeof(cl_uint), &supported, NULL));
    if (!supported) {
      futhark_panic(1, "Program uses double-precision floats, but this is not supported on the chosen device: %s\n",
                    device_option.device_name);
    }
  }
  size_t max_group_size;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                       sizeof(size_t), &max_group_size, NULL));
  size_t max_tile_size = sqrt(max_group_size);
  // CL_DEVICE_LOCAL_MEM_SIZE is a cl_ulong, so the size passed must
  // be that of cl_ulong (size_t may be narrower).
  cl_ulong max_local_memory;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_LOCAL_MEM_SIZE,
                                       sizeof(cl_ulong), &max_local_memory, NULL));
  // Futhark reserves 4 bytes for bookkeeping information.
  max_local_memory -= 4;
  // The OpenCL implementation may reserve some local memory bytes for
  // various purposes.  In principle, we should use
  // clGetKernelWorkGroupInfo() to figure out for each kernel how much
  // is actually available, but our current code generator design
  // makes this infeasible.  Instead, we have this nasty hack where we
  // arbitrarily subtract some bytes, based on empirical measurements
  // (but which might be arbitrarily wrong).  Fortunately, we rarely
  // try to really push the local memory usage.
  if (strstr(device_option.platform_name, "NVIDIA CUDA") != NULL) {
    max_local_memory -= 12;
  } else if (strstr(device_option.platform_name, "AMD") != NULL) {
    max_local_memory -= 16;
  }
  // Make sure this function is defined.
  post_opencl_setup(ctx, &device_option);
  if (max_group_size < ctx->cfg->default_group_size) {
    if (ctx->cfg->default_group_size_changed) {
      fprintf(stderr, "Note: Device limits default group size to %zu (down from %zu).\n",
              max_group_size, ctx->cfg->default_group_size);
    }
    ctx->cfg->default_group_size = max_group_size;
  }
  if (max_tile_size < ctx->cfg->default_tile_size) {
    if (ctx->cfg->default_tile_size_changed) {
      fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n",
              max_tile_size, ctx->cfg->default_tile_size);
    }
    ctx->cfg->default_tile_size = max_tile_size;
  }
  ctx->max_group_size = max_group_size;
  ctx->max_tile_size = max_tile_size; // sqrt(max_group_size), see above.
  ctx->max_threshold = ctx->max_num_groups = 0; // No limit.
  ctx->max_local_memory = max_local_memory;
  // Now we go through all the sizes, clamp them to the valid range,
  // or set them to the default.
  for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
    const char *size_class = ctx->cfg->tuning_param_classes[i];
    int64_t *size_value = &ctx->cfg->tuning_params[i];
    const char* size_name = ctx->cfg->tuning_param_names[i];
    int64_t max_value = 0, default_value = 0;
    // strstr(...) == size_class tests whether the class starts with
    // the given prefix.
    if (strstr(size_class, "group_size") == size_class) {
      max_value = max_group_size;
      default_value = ctx->cfg->default_group_size;
    } else if (strstr(size_class, "num_groups") == size_class) {
      max_value = max_group_size; // Futhark assumes this constraint.
      default_value = ctx->cfg->default_num_groups;
      // XXX: as a quick and dirty hack, use twice as many threads for
      // histograms by default.  We really should just be smarter
      // about sizes somehow.
      if (strstr(size_name, ".seghist_") != NULL) {
        default_value *= 2;
      }
    } else if (strstr(size_class, "tile_size") == size_class) {
      max_value = sqrt(max_group_size);
      default_value = ctx->cfg->default_tile_size;
    } else if (strstr(size_class, "reg_tile_size") == size_class) {
      max_value = 0; // No limit.
      default_value = ctx->cfg->default_reg_tile_size;
    } else if (strstr(size_class, "threshold") == size_class) {
      // Threshold can be as large as it takes.
      default_value = ctx->cfg->default_threshold;
    } else {
      // Bespoke sizes have no limit or default.
    }
    if (*size_value == 0) {
      *size_value = default_value;
    } else if (max_value > 0 && *size_value > max_value) {
      fprintf(stderr, "Note: Device limits %s to %d (down from %d)\n",
              size_name, (int)max_value, (int)*size_value);
      *size_value = max_value;
    }
  }
  if (ctx->lockstep_width == 0) {
    ctx->lockstep_width = 1;
  }
  if (ctx->cfg->logging) {
    fprintf(stderr, "Lockstep width: %d\n", (int)ctx->lockstep_width);
    fprintf(stderr, "Default group size: %d\n", (int)ctx->cfg->default_group_size);
    fprintf(stderr, "Default number of groups: %d\n", (int)ctx->cfg->default_num_groups);
  }
  char *compile_opts = mk_compile_opts(ctx, extra_build_opts, device_option);
  if (ctx->cfg->logging) {
    fprintf(stderr, "OpenCL compiler options: %s\n", compile_opts);
  }
  char *fut_opencl_src = NULL;
  // Binary read from cfg->load_binary_from, if any; freed after the build.
  unsigned char *fut_opencl_bin = NULL;
  cl_program prog;
  error = CL_SUCCESS;
  struct cache_hash h;
  // Set once 'h' has been computed; cache_store() must not be called
  // with an uninitialised hash.
  int cache_hash_valid = 0;
  int loaded_from_cache = 0;
  if (ctx->cfg->load_binary_from == NULL) {
    // A length of 0 tells clCreateProgramWithSource() that the source
    // string is NUL-terminated.
    size_t src_size = 0;
    // Maybe we have to read OpenCL source from somewhere else (used for debugging).
    if (ctx->cfg->load_program_from != NULL) {
      fut_opencl_src = slurp_file(ctx->cfg->load_program_from, NULL);
      assert(fut_opencl_src != NULL);
    } else {
      // Construct the OpenCL source concatenating all the fragments.
      for (const char **src = srcs; src && *src; src++) {
        src_size += strlen(*src);
      }
      fut_opencl_src = (char*) malloc(src_size + 1);
      // memcpy() copies exactly each fragment's length; strncpy()
      // would additionally zero-fill the rest of the buffer on every
      // iteration.
      size_t n = 0;
      for (size_t i = 0; srcs && srcs[i]; i++) {
        size_t fragment_len = strlen(srcs[i]);
        memcpy(fut_opencl_src+n, srcs[i], fragment_len);
        n += fragment_len;
      }
      fut_opencl_src[src_size] = 0;
    }
    if (ctx->cfg->dump_program_to != NULL) {
      if (ctx->cfg->logging) {
        fprintf(stderr, "Dumping OpenCL source to %s...\n", ctx->cfg->dump_program_to);
      }
      dump_file(ctx->cfg->dump_program_to, fut_opencl_src, strlen(fut_opencl_src));
    }
    if (cache_fname != NULL) {
      if (ctx->cfg->logging) {
        fprintf(stderr, "Restoring cache from from %s...\n", cache_fname);
      }
      // The cache key covers both the source and the compiler options.
      cache_hash_init(&h);
      cache_hash(&h, fut_opencl_src, strlen(fut_opencl_src));
      cache_hash(&h, compile_opts, strlen(compile_opts));
      cache_hash_valid = 1;
      unsigned char *buf;
      size_t bufsize;
      errno = 0;
      if (cache_restore(cache_fname, &h, &buf, &bufsize) != 0) {
        if (ctx->cfg->logging) {
          fprintf(stderr, "Failed to restore cache (errno: %s)\n", strerror(errno));
        }
      } else {
        if (ctx->cfg->logging) {
          fprintf(stderr, "Cache restored; loading OpenCL binary...\n");
        }
        cl_int status = 0;
        prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device,
                                         &bufsize, (const unsigned char**)&buf,
                                         &status, &error);
        if (status == CL_SUCCESS) {
          loaded_from_cache = 1;
          if (ctx->cfg->logging) {
            fprintf(stderr, "Loading succeeded.\n");
          }
        } else {
          if (ctx->cfg->logging) {
            fprintf(stderr, "Loading failed.\n");
          }
        }
      }
    }
    if (!loaded_from_cache) {
      if (ctx->cfg->logging) {
        fprintf(stderr, "Creating OpenCL program...\n");
      }
      const char* src_ptr[] = {fut_opencl_src};
      prog = clCreateProgramWithSource(ctx->ctx, 1, src_ptr, &src_size, &error);
      OPENCL_SUCCEED_FATAL(error);
    }
  } else {
    if (ctx->cfg->logging) {
      fprintf(stderr, "Loading OpenCL binary from %s...\n", ctx->cfg->load_binary_from);
    }
    size_t binary_size;
    fut_opencl_bin =
      (unsigned char*) slurp_file(ctx->cfg->load_binary_from, &binary_size);
    assert(fut_opencl_bin != NULL);
    const unsigned char *binaries[1] = { fut_opencl_bin };
    cl_int status = 0;
    prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device,
                                     &binary_size, binaries,
                                     &status, &error);
    OPENCL_SUCCEED_FATAL(status);
    OPENCL_SUCCEED_FATAL(error);
  }
  if (ctx->cfg->logging) {
    fprintf(stderr, "Building OpenCL program...\n");
  }
  OPENCL_SUCCEED_FATAL(build_opencl_program(prog, device_option.device, compile_opts));
  free(compile_opts);
  free(fut_opencl_src);
  free(fut_opencl_bin);
  size_t binary_size = 0;
  unsigned char *binary = NULL;
  int store_in_cache = cache_fname != NULL && cache_hash_valid && !loaded_from_cache;
  if (store_in_cache || ctx->cfg->dump_binary_to != NULL) {
    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES,
                                          sizeof(size_t), &binary_size, NULL));
    binary = (unsigned char*) malloc(binary_size);
    // CL_PROGRAM_BINARIES expects an array of pointers (one per
    // device); with a single device that array is just &binary.
    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARIES,
                                          sizeof(unsigned char*), &binary, NULL));
  }
  if (store_in_cache) {
    if (ctx->cfg->logging) {
      fprintf(stderr, "Caching OpenCL binary in %s...\n", cache_fname);
    }
    if (cache_store(cache_fname, &h, binary, binary_size) != 0) {
      printf("Failed to cache binary: %s\n", strerror(errno));
    }
  }
  if (ctx->cfg->dump_binary_to != NULL) {
    if (ctx->cfg->logging) {
      fprintf(stderr, "Dumping OpenCL binary to %s...\n", ctx->cfg->dump_binary_to);
    }
    dump_file(ctx->cfg->dump_binary_to, binary, binary_size);
  }
  // The binary has been written wherever it was wanted.
  free(binary);
  ctx->clprogram = prog;
}
static struct opencl_device_option get_preferred_device(const struct futhark_context_config *cfg) {
struct opencl_device_option *devices;
size_t num_devices;
opencl_all_device_options(&devices, &num_devices);
int num_device_matches = 0;
for (size_t i = 0; i < num_devices; i++) {
struct opencl_device_option device = devices[i];
if (strstr(device.platform_name, cfg->preferred_platform) != NULL &&
strstr(device.device_name, cfg->preferred_device) != NULL &&
(cfg->ignore_blacklist ||
!is_blacklisted(device.platform_name, device.device_name, cfg)) &&
num_device_matches++ == cfg->preferred_device_num) {
// Free all the platform and device names, except the ones we have chosen.
for (size_t j = 0; j < num_devices; j++) {
if (j != i) {
free(devices[j].platform_name);
free(devices[j].device_name);
}
}
free(devices);
return device;
}
}
futhark_panic(1, "Could not find acceptable OpenCL device.\n");
exit(1); // Never reached
}
static void setup_opencl(struct futhark_context *ctx,
const char *srcs[],
const char *extra_build_opts[],
const char* cache_fname) {
struct opencl_device_option device_option = get_preferred_device(ctx->cfg);
if (ctx->cfg->logging) {
fprintf(stderr, "Using platform: %s\n", device_option.platform_name);
fprintf(stderr, "Using device: %s\n", device_option.device_name);
}
// Note that NVIDIA's OpenCL requires the platform property
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)device_option.platform,
0
};
cl_int clCreateContext_error;
ctx->ctx = clCreateContext(properties, 1, &device_option.device, NULL, NULL, &clCreateContext_error);
OPENCL_SUCCEED_FATAL(clCreateContext_error);
cl_int clCreateCommandQueue_error;
cl_command_queue queue =
clCreateCommandQueue(ctx->ctx,
device_option.device,
ctx->cfg->profiling ? CL_QUEUE_PROFILING_ENABLE : 0,
&clCreateCommandQueue_error);
OPENCL_SUCCEED_FATAL(clCreateCommandQueue_error);
setup_opencl_with_command_queue(ctx, queue, srcs, extra_build_opts, cache_fname);
}
int backend_context_setup(struct futhark_context* ctx) {
ctx->lockstep_width = 0; // Real value set later.
ctx->profiling_records_capacity = 200;
ctx->profiling_records_used = 0;
ctx->profiling_records =
malloc(ctx->profiling_records_capacity *
sizeof(struct profiling_record));
ctx->failure_is_an_option = 0;
ctx->total_runs = 0;
ctx->total_runtime = 0;
ctx->peak_mem_usage_device = 0;
ctx->cur_mem_usage_device = 0;
if (ctx->cfg->queue_set) {
setup_opencl_with_command_queue(ctx, ctx->cfg->queue, opencl_program, ctx->cfg->build_opts, ctx->cfg->cache_fname);
} else {
setup_opencl(ctx, opencl_program, ctx->cfg->build_opts, ctx->cfg->cache_fname);
}
cl_int error;
cl_int no_error = -1;
ctx->global_failure =
clCreateBuffer(ctx->ctx,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(cl_int), &no_error, &error);
OPENCL_SUCCEED_OR_RETURN(error);
// The +1 is to avoid zero-byte allocations.
ctx->global_failure_args =
clCreateBuffer(ctx->ctx,
CL_MEM_READ_WRITE,
sizeof(int64_t)*(max_failure_args+1), NULL, &error);
OPENCL_SUCCEED_OR_RETURN(error);
return 0;
}
// Release the OpenCL-specific resources held by a context.
//
// The two failure-reporting buffers are released first (errors there go
// through OPENCL_SUCCEED_FATAL).  Results of the remaining calls are
// deliberately discarded: pending profiling records are tallied, the record
// array is freed, opencl_free_all() is invoked, and finally the program,
// command queue and OpenCL context are released in that order.
void backend_context_teardown(struct futhark_context* ctx) {
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure));
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args));
// Tally before freeing the record array that the tally reads from.
// NOTE(review): presumably the tally consumes ctx->profiling_records - confirm.
(void)opencl_tally_profiling_records(ctx);
free(ctx->profiling_records);
(void)opencl_free_all(ctx);
(void)clReleaseProgram(ctx->clprogram);
(void)clReleaseCommandQueue(ctx->queue);
(void)clReleaseContext(ctx->ctx);
}
// Return the OpenCL command queue stored in the context (ctx->queue).
// No reference count is taken on the queue here.
cl_command_queue futhark_context_get_command_queue(struct futhark_context* ctx) {
  cl_command_queue current_queue = ctx->queue;
  return current_queue;
}
// End of backends/opencl.h
// Produce a heap-allocated message for the given failure index.
//
// This program defines no failure points, so every index maps to the same
// generic text.  The result comes from strdup() and must be released with
// free() by the caller.
static char *get_failure_msg(int failure_idx, int64_t args[])
{
    // Neither argument influences the result for this program.
    (void) failure_idx;
    (void) args;
    static const char generic_message[] = "Unknown error. This is a compiler bug.";
    return strdup(generic_message);
}
void post_opencl_setup(struct futhark_context *ctx, struct opencl_device_option *option)
{
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "NVIDIA CUDA") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->lockstep_width = 32;
}
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "AMD Accelerated Parallel Processing") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->lockstep_width = 32;
}
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->lockstep_width = 1;
}
if ((ctx->cfg->default_num_groups == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
size_t MAX_COMPUTE_UNITS_val = 0;
clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL);
ctx->cfg->default_num_groups = 4 * MAX_COMPUTE_UNITS_val;
}
if ((ctx->cfg->default_group_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->cfg->default_group_size = 256;
}
if ((ctx->cfg->default_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->cfg->default_tile_size = 16;
}
if ((ctx->cfg->default_reg_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->cfg->default_reg_tile_size = 4;
}
if ((ctx->cfg->default_threshold == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
ctx->cfg->default_threshold = 32768;
}
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
ctx->lockstep_width = 1;
}
if ((ctx->cfg->default_num_groups == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
size_t MAX_COMPUTE_UNITS_val = 0;
clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL);
ctx->cfg->default_num_groups = MAX_COMPUTE_UNITS_val;
}
if ((ctx->cfg->default_group_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
ctx->cfg->default_group_size = 32;
}
if ((ctx->cfg->default_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
ctx->cfg->default_tile_size = 4;
}
if ((ctx->cfg->default_reg_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
ctx->cfg->default_reg_tile_size = 1;
}
if ((ctx->cfg->default_threshold == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
size_t MAX_COMPUTE_UNITS_val = 0;
clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL);
ctx->cfg->default_threshold = MAX_COMPUTE_UNITS_val;
}
}
// Per-context program state: the OpenCL kernel objects of this program and
// the profiling counters reported by futhark_context_report().
// For each counted operation there is a "<name>_runs" count and a
// "<name>_total_runtime" accumulator (the report prints the latter with a
// "us" suffix).
struct program {
// Kernel objects, created in setup_program() and released in teardown_program().
cl_kernel addzisegmap_6892;
cl_kernel add_i64zisegmap_6912;
// Counters for memory copies.
int64_t copy_dev_to_dev_total_runtime;
int copy_dev_to_dev_runs;
int64_t copy_dev_to_host_total_runtime;
int copy_dev_to_host_runs;
int64_t copy_host_to_dev_total_runtime;
int copy_host_to_dev_runs;
int64_t copy_scalar_to_dev_total_runtime;
int copy_scalar_to_dev_runs;
int64_t copy_scalar_from_dev_total_runtime;
int copy_scalar_from_dev_runs;
// Counters for the two kernels.
int64_t addzisegmap_6892_total_runtime;
int addzisegmap_6892_runs;
int64_t add_i64zisegmap_6912_total_runtime;
int add_i64zisegmap_6912_runs;
};
// Allocate ctx->program, clear all profiling counters, and create the two
// kernels of this program from ctx->clprogram.  Argument 0 of each kernel is
// bound to the ctx->global_failure buffer.  Kernel creation and argument
// errors are handled by OPENCL_SUCCEED_FATAL.
static void setup_program(struct futhark_context *ctx)
{
    int error = 0;

    ctx->program = malloc(sizeof(struct program));
    // Every counter starts at zero; the kernel fields are assigned below.
    memset(ctx->program, 0, sizeof(struct program));

    ctx->program->addzisegmap_6892 = clCreateKernel(ctx->clprogram, "addzisegmap_6892", &error);
    OPENCL_SUCCEED_FATAL(error);
    OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->program->addzisegmap_6892, 0, sizeof(cl_mem), &ctx->global_failure));
    if (ctx->debugging) {
        fprintf(ctx->log, "Created kernel %s.\n", "add.segmap_6892");
    }

    ctx->program->add_i64zisegmap_6912 = clCreateKernel(ctx->clprogram, "add_i64zisegmap_6912", &error);
    OPENCL_SUCCEED_FATAL(error);
    OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 0, sizeof(cl_mem), &ctx->global_failure));
    if (ctx->debugging) {
        fprintf(ctx->log, "Created kernel %s.\n", "add_i64.segmap_6912");
    }
}
// Release both kernel objects and free the ctx->program structure that
// setup_program() allocated.  Release errors are handled by
// OPENCL_SUCCEED_FATAL.
static void teardown_program(struct futhark_context *ctx)
{
    struct program *owned = ctx->program;

    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->program->addzisegmap_6892));
    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->program->add_i64zisegmap_6912));
    free(owned);
}
// Point each named tuning parameter of the context at its slot in the
// configuration's tuning_params array: slot 0 holds the group size for the
// "add" segmap and slot 1 the group size for the "add_i64" segmap.
static void set_tuning_params(struct futhark_context *ctx)
{
    ctx->tuning_params.addzisegmap_group_sizze_6879 = ctx->cfg->tuning_params + 0;
    ctx->tuning_params.add_i64zisegmap_group_sizze_6899 = ctx->cfg->tuning_params + 1;
}
// Drop one reference to a device memory block.
//
// A block whose references pointer is NULL holds nothing and is left
// untouched.  Otherwise the shared counter is decremented; when it reaches
// zero the block's size is subtracted from ctx->cur_mem_usage_device, the
// memory is handed to opencl_free() and the counter itself is freed.  In
// either case block->references ends up NULL.  Returns 0 when the end of the
// function is reached; an opencl_free() error is handled by
// OPENCL_SUCCEED_OR_RETURN.
int memblock_unref_device(struct futhark_context *ctx, struct memblock_device *block, const char *desc)
{
    if (block->references == NULL) {
        return 0;
    }

    int *counter = block->references;
    *counter -= 1;
    if (ctx->detail_memory) {
        fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "space 'device'", *counter);
    }

    if (*counter == 0) {
        ctx->cur_mem_usage_device -= block->size;
        OPENCL_SUCCEED_OR_RETURN(opencl_free(ctx, block->mem, block->size, desc));
        free(counter);
        if (ctx->detail_memory) {
            fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_device);
        }
    }

    block->references = NULL;
    return 0;
}
// Make `block` refer to a freshly allocated device buffer of `size` bytes.
//
// Any block previously held by `block` is unreferenced first.  On success the
// block gets a new reference counter set to 1, ctx->cur_mem_usage_device is
// increased by the size reported back by opencl_alloc(), the peak usage is
// raised if the new total exceeds it, and FUTHARK_SUCCESS is returned.
//
// On allocation failure ctx->error is replaced by a message that embeds the
// underlying error text, and FUTHARK_OUT_OF_MEMORY is returned.  A negative
// `size` is reported through futhark_panic().
//
// ctx:   context whose usage statistics and error slot are updated.
// block: block to (re)initialise.
// size:  requested size in bytes; must not be negative.
// desc:  description used in log and error messages; the pointer is stored
//        in the block, not copied.
int memblock_alloc_device(struct futhark_context *ctx, struct memblock_device *block, int64_t size, const char *desc)
{
    if (size < 0)
        futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "space 'device'");
    int ret = memblock_unref_device(ctx, block, desc);
    if (ret != FUTHARK_SUCCESS)
        return ret;
    if (ctx->detail_memory)
        fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "space 'device'", (long long) ctx->cur_mem_usage_device + size);
    // Catch up in case the recorded peak is below current usage (the peak
    // can be reset to zero independently of the current usage).
    if (ctx->cur_mem_usage_device > ctx->peak_mem_usage_device)
        ctx->peak_mem_usage_device = ctx->cur_mem_usage_device;
    // Report a new peak when the usage announced above would exceed it.
    if (ctx->detail_memory) {
        if (ctx->cur_mem_usage_device + size > ctx->peak_mem_usage_device)
            fprintf(ctx->log, " (new peak).\n");
        else
            fprintf(ctx->log, ".\n");
    }
    ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_alloc(ctx, ctx->log, (size_t) size, desc, &block->mem, (size_t *) &size));
    if (ctx->error == NULL) {
        block->references = (int *) malloc(sizeof(int));
        *block->references = 1;
        block->size = size;
        block->desc = desc;
        ctx->cur_mem_usage_device += size;
        // Record the peak after the allocation is accounted for; otherwise
        // the most recent allocation would never contribute to the peak.
        if (ctx->cur_mem_usage_device > ctx->peak_mem_usage_device)
            ctx->peak_mem_usage_device = ctx->cur_mem_usage_device;
        return FUTHARK_SUCCESS;
    } else {
        // We are naively assuming that any memory allocation error is due to OOM.
        lock_lock(&ctx->error_lock);
        char *old_error = ctx->error;
        ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated: %12lld bytes\n%s", "space 'device'", (long long) size, (long long) ctx->cur_mem_usage_device, old_error);
        free(old_error);
        lock_unlock(&ctx->error_lock);
        return FUTHARK_OUT_OF_MEMORY;
    }
}
int memblock_set_device(struct futhark_context *ctx, struct memblock_device *lhs, struct memblock_device *rhs, const char *lhs_desc)
{
int ret = memblock_unref_device(ctx, lhs, lhs_desc);
if (rhs->references != NULL)
(*rhs->references)++;
*lhs = *rhs;
return ret;
}
// Drop one reference to a host ("default space") memory block.
//
// Nothing happens when block->references is NULL.  Otherwise the shared
// counter is decremented; at zero the block's size is subtracted from
// ctx->cur_mem_usage_default, the memory is passed to host_free() and the
// counter is freed.  block->references is NULL afterwards.  Always returns 0.
int memblock_unref(struct futhark_context *ctx, struct memblock *block, const char *desc)
{
    if (block->references == NULL) {
        return 0;
    }

    int *counter = block->references;
    *counter -= 1;
    if (ctx->detail_memory) {
        fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "default space", *counter);
    }

    if (*counter == 0) {
        ctx->cur_mem_usage_default -= block->size;
        host_free(ctx, (size_t) block->size, desc, (void *) block->mem);
        free(counter);
        if (ctx->detail_memory) {
            fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_default);
        }
    }

    block->references = NULL;
    return 0;
}
int memblock_alloc(struct futhark_context *ctx, struct memblock *block, int64_t size, const char *desc)
{
if (size < 0)
futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "default space", ctx->cur_mem_usage_default);
int ret = memblock_unref(ctx, block, desc);
if (ret != FUTHARK_SUCCESS)
return ret;
if (ctx->detail_memory)
fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "default space", (long long) ctx->cur_mem_usage_default + size);
if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) {
ctx->peak_mem_usage_default = ctx->cur_mem_usage_default;
if (ctx->detail_memory)
fprintf(ctx->log, " (new peak).\n");
} else if (ctx->detail_memory)
fprintf(ctx->log, ".\n");
host_alloc(ctx, (size_t) size, desc, (size_t *) &size, (void *) &block->mem);
if (ctx->error == NULL) {
block->references = (int *) malloc(sizeof(int));
*block->references = 1;
block->size = size;
block->desc = desc;
ctx->cur_mem_usage_default += size;
return FUTHARK_SUCCESS;
} else {
// We are naively assuming that any memory allocation error is due to OOM.
lock_lock(&ctx->error_lock);
char *old_error = ctx->error;
ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated: %12lld bytes\n%s", "default space", (long long) size, (long long) ctx->cur_mem_usage_default, old_error);
free(old_error);
lock_unlock(&ctx->error_lock);
return FUTHARK_OUT_OF_MEMORY;
}
}
int memblock_set(struct futhark_context *ctx, struct memblock *lhs, struct memblock *rhs, const char *lhs_desc)
{
int ret = memblock_unref(ctx, lhs, lhs_desc);
if (rhs->references != NULL)
(*rhs->references)++;
*lhs = *rhs;
return ret;
}
// Set the debugging flag.  The same value is propagated to the logging and
// profiling flags, so turning debugging on also turns those on (and turning
// it off turns them off).
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag)
{
    cfg->debugging = flag;
    cfg->logging = cfg->debugging;
    cfg->profiling = cfg->logging;
}
// Set the profiling flag of the configuration to `flag`.
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag)
{
    const int profiling_enabled = flag;
    cfg->profiling = profiling_enabled;
}
// Set the logging flag of the configuration to `flag`.
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag)
{
    const int logging_enabled = flag;
    cfg->logging = logging_enabled;
}
// Record the cache file name in the configuration.  Only the pointer is
// stored; the string is not copied, so it must stay valid for as long as the
// configuration uses it.
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f)
{
    const char *cache_path = f;
    cfg->cache_fname = cache_path;
}
// Return the number of tuning parameters defined by this program.
int futhark_get_tuning_param_count(void)
{
    const int count = num_tuning_params;
    return count;
}
const char *futhark_get_tuning_param_name(int i)
{
return tuning_param_names[i];
}
const char *futhark_get_tuning_param_class(int i)
{
return tuning_param_classes[i];
}
// Build a textual report of memory usage and, when profiling is enabled,
// per-operation run counts and runtimes.
//
// The context is synchronised first; if that fails, NULL is returned.
// Otherwise the string accumulated in a str_builder is returned.
// NOTE(review): presumably the caller owns the returned string and must
// free it - confirm against the str_builder implementation.
//
// NOTE(review): each call adds the per-operation totals to ctx->total_runs
// and ctx->total_runtime, and nothing in this function resets them, so the
// cumulative line may grow with repeated calls - confirm whether
// opencl_tally_profiling_records() resets the per-operation counters.
char *futhark_context_report(struct futhark_context *ctx)
{
    if (futhark_context_sync(ctx) != 0)
        return NULL;
    struct str_builder builder;
    str_builder_init(&builder);
    str_builder(&builder, "Peak memory usage for space 'device': %lld bytes.\n", (long long) ctx->peak_mem_usage_device);
    if (ctx->profiling) {
        OPENCL_SUCCEED_FATAL(opencl_tally_profiling_records(ctx));
        // For every operation: one line with count, average and total, then
        // fold its numbers into the context-wide totals.  The average divides
        // by 1 instead of 0 when the operation never ran.
        str_builder(&builder, "copy_dev_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_dev_to_dev_runs, (long) ctx->program->copy_dev_to_dev_total_runtime / (ctx->program->copy_dev_to_dev_runs != 0 ? ctx->program->copy_dev_to_dev_runs : 1), (long) ctx->program->copy_dev_to_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_dev_to_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_dev_to_dev_runs;
        str_builder(&builder, "copy_dev_to_host ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_dev_to_host_runs, (long) ctx->program->copy_dev_to_host_total_runtime / (ctx->program->copy_dev_to_host_runs != 0 ? ctx->program->copy_dev_to_host_runs : 1), (long) ctx->program->copy_dev_to_host_total_runtime);
        ctx->total_runtime += ctx->program->copy_dev_to_host_total_runtime;
        ctx->total_runs += ctx->program->copy_dev_to_host_runs;
        str_builder(&builder, "copy_host_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_host_to_dev_runs, (long) ctx->program->copy_host_to_dev_total_runtime / (ctx->program->copy_host_to_dev_runs != 0 ? ctx->program->copy_host_to_dev_runs : 1), (long) ctx->program->copy_host_to_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_host_to_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_host_to_dev_runs;
        str_builder(&builder, "copy_scalar_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_scalar_to_dev_runs, (long) ctx->program->copy_scalar_to_dev_total_runtime / (ctx->program->copy_scalar_to_dev_runs != 0 ? ctx->program->copy_scalar_to_dev_runs : 1), (long) ctx->program->copy_scalar_to_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_scalar_to_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_scalar_to_dev_runs;
        str_builder(&builder, "copy_scalar_from_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_scalar_from_dev_runs, (long) ctx->program->copy_scalar_from_dev_total_runtime / (ctx->program->copy_scalar_from_dev_runs != 0 ? ctx->program->copy_scalar_from_dev_runs : 1), (long) ctx->program->copy_scalar_from_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_scalar_from_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_scalar_from_dev_runs;
        str_builder(&builder, "add.segmap_6892 ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->addzisegmap_6892_runs, (long) ctx->program->addzisegmap_6892_total_runtime / (ctx->program->addzisegmap_6892_runs != 0 ? ctx->program->addzisegmap_6892_runs : 1), (long) ctx->program->addzisegmap_6892_total_runtime);
        ctx->total_runtime += ctx->program->addzisegmap_6892_total_runtime;
        ctx->total_runs += ctx->program->addzisegmap_6892_runs;
        str_builder(&builder, "add_i64.segmap_6912 ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->add_i64zisegmap_6912_runs, (long) ctx->program->add_i64zisegmap_6912_total_runtime / (ctx->program->add_i64zisegmap_6912_runs != 0 ? ctx->program->add_i64zisegmap_6912_runs : 1), (long) ctx->program->add_i64zisegmap_6912_total_runtime);
        ctx->total_runtime += ctx->program->add_i64zisegmap_6912_total_runtime;
        ctx->total_runs += ctx->program->add_i64zisegmap_6912_runs;
        // Cast so the argument matches the %ld conversion, as every
        // per-operation line above already does.
        str_builder(&builder, "%d operations with cumulative runtime: %6ldus\n", ctx->total_runs, (long) ctx->total_runtime);
    }
    return builder.str;
}
char *futhark_context_get_error(struct futhark_context *ctx)
{
char *error = ctx->error;
ctx->error = NULL;
return error;
}
// Direct the context's log output to the stream `f`.  Only the FILE pointer
// is stored.
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f)
{
    FILE *log_stream = f;
    ctx->log = log_stream;
}
// Mark profiling as paused by setting ctx->profiling_paused to 1.
void futhark_context_pause_profiling(struct futhark_context *ctx)
{
    const int paused = 1;
    ctx->profiling_paused = paused;
}
// Mark profiling as running again by setting ctx->profiling_paused to 0.
void futhark_context_unpause_profiling(struct futhark_context *ctx)
{
    const int paused = 0;
    ctx->profiling_paused = paused;
}
int futhark_context_clear_caches(struct futhark_context *ctx)
{
lock_lock(&ctx->lock);
ctx->peak_mem_usage_device = 0;
ctx->peak_mem_usage_default = 0;
if (ctx->error == NULL)
ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_free_all(ctx));
lock_unlock(&ctx->lock);
return ctx->error != NULL;
}
// Start of context.h
// Eventually it would be nice to move the context definition in here
// instead of generating it in the compiler. For now it defines
// various helper functions that must be available.
// Internal functions.
static void set_error(struct futhark_context* ctx, char *error) {
lock_lock(&ctx->error_lock);
if (ctx->error == NULL) {
ctx->error = error;
} else {
free(error);
}
lock_unlock(&ctx->error_lock);
}
// XXX: should be static, but used in ispc_util.h
//
// Record an out-of-memory error for a failed reallocation of `new_size`
// bytes.  The message is formatted with msgprintf() and handed to
// set_error().
void lexical_realloc_error(struct futhark_context* ctx, size_t new_size) {
  char *message =
    msgprintf("Failed to allocate memory.\nAttempted allocation: %12lld bytes\n",
              (long long) new_size);
  set_error(ctx, message);
}
// Resize the buffer at *ptr to `new_size` bytes with realloc().
//
// On success *ptr and *old_size are updated and FUTHARK_SUCCESS is returned.
// When realloc() returns NULL, *ptr and *old_size are left unchanged, an
// error is recorded through lexical_realloc_error(), and
// FUTHARK_OUT_OF_MEMORY is returned.
static int lexical_realloc(struct futhark_context *ctx,
                           unsigned char **ptr,
                           int64_t *old_size,
                           int64_t new_size) {
  unsigned char *resized = realloc(*ptr, (size_t)new_size);
  if (resized != NULL) {
    *ptr = resized;
    *old_size = new_size;
    return FUTHARK_SUCCESS;
  }
  lexical_realloc_error(ctx, new_size);
  return FUTHARK_OUT_OF_MEMORY;
}
// Pack the context's free list, then repeatedly take its first entry and
// free() it until free_list_first() reports that nothing is left (non-zero
// result).
static void free_all_in_free_list(struct futhark_context* ctx) {
  fl_mem entry;
  free_list_pack(&ctx->free_list);
  for (;;) {
    if (free_list_first(&ctx->free_list, &entry) != 0) {
      break;
    }
    free((void*)entry);
  }
}
// Tell whether an allocation of `size` bytes counts as "small", i.e. is
// strictly below 1 MiB.  Returns 1 for small sizes and 0 otherwise.
static int is_small_alloc(size_t size) {
  const size_t small_alloc_limit = 1024*1024;
  return size < small_alloc_limit ? 1 : 0;
}
// Obtain a host buffer of at least `size` bytes.
//
// Large requests are first looked up in the context's free list; when a
// match is found, free_list_find() fills in *size_out and *mem_out.  Small
// requests, and large ones with no match, fall back to malloc(size) with
// *size_out set to `size`.  The malloc() result is stored unchecked.
static void host_alloc(struct futhark_context* ctx,
                       size_t size, const char* tag, size_t* size_out, void** mem_out) {
  const int reused =
    !is_small_alloc(size) &&
    free_list_find(&ctx->free_list, size, tag, size_out, (fl_mem*)mem_out) == 0;

  if (!reused) {
    *size_out = size;
    *mem_out = malloc(size);
  }
}
// Give back a host buffer of `size` bytes.
//
// Large buffers are parked in the context's free list for reuse: they are
// mmap()ed/munmapped() every time otherwise, which is very slow, and Futhark
// programs tend to use a few very large allocations.  Small buffers go
// straight to free(), since malloc()'s own free list handles them well.  The
// threshold is kind of arbitrary, but seems to work OK.
static void host_free(struct futhark_context* ctx,
                      size_t size, const char* tag, void* mem) {
  if (!is_small_alloc(size)) {
    free_list_insert(&ctx->free_list, size, (fl_mem)mem, tag);
    return;
  }
  free(mem);
}
// Allocate a configuration with default settings.
//
// All flags start at 0, no cache file is set, and the tuning parameters are
// initialised from tuning_param_defaults.  Backend-specific defaults are
// filled in by backend_context_config_setup().
//
// Returns NULL if either the configuration or its tuning-parameter array
// cannot be allocated; nothing is leaked in that case.
struct futhark_context_config* futhark_context_config_new(void) {
  struct futhark_context_config* cfg = malloc(sizeof(struct futhark_context_config));
  if (cfg == NULL) {
    return NULL;
  }
  cfg->in_use = 0;
  cfg->debugging = 0;
  cfg->profiling = 0;
  cfg->logging = 0;
  cfg->cache_fname = NULL;
  cfg->num_tuning_params = num_tuning_params;
  cfg->tuning_params = malloc(cfg->num_tuning_params * sizeof(int64_t));
  // A NULL result is only a failure when there is something to store:
  // malloc(0) is allowed to return NULL.
  if (cfg->num_tuning_params > 0) {
    if (cfg->tuning_params == NULL) {
      free(cfg);
      return NULL;
    }
    memcpy(cfg->tuning_params, tuning_param_defaults,
           cfg->num_tuning_params * sizeof(int64_t));
  }
  cfg->tuning_param_names = tuning_param_names;
  cfg->tuning_param_vars = tuning_param_vars;
  cfg->tuning_param_classes = tuning_param_classes;
  backend_context_config_setup(cfg);
  return cfg;
}
// Destroy a configuration: run the backend teardown hook, then free the
// tuning-parameter array and the configuration itself.  The configuration
// must not be in use by a context (checked with assert).
void futhark_context_config_free(struct futhark_context_config* cfg) {
  const int still_in_use = cfg->in_use;
  assert(!still_in_use);

  backend_context_config_teardown(cfg);

  free(cfg->tuning_params);
  free(cfg);
}
// Create a context from a configuration.
//
// Returns NULL only when the context structure itself cannot be allocated.
// The configuration must not already be in use (checked with assert) and is
// marked as in use.  The debugging, logging and profiling flags are copied
// from the configuration, and log output initially goes to stderr.
//
// When backend_context_setup() reports failure (non-zero), the remaining
// initialisation steps are skipped and the partly initialised context is
// still returned.
struct futhark_context* futhark_context_new(struct futhark_context_config* cfg) {
  struct futhark_context* ctx = malloc(sizeof(struct futhark_context));
  if (ctx == NULL) {
    return NULL;
  }

  assert(!cfg->in_use);
  ctx->cfg = cfg;
  ctx->cfg->in_use = 1;

  create_lock(&ctx->error_lock);
  create_lock(&ctx->lock);
  free_list_init(&ctx->free_list);

  // Host memory statistics.
  ctx->cur_mem_usage_default = 0;
  ctx->peak_mem_usage_default = 0;

  ctx->constants = malloc(sizeof(struct constants));

  // Flags taken over from the configuration.
  ctx->debugging = cfg->debugging;
  ctx->detail_memory = cfg->debugging;
  ctx->logging = cfg->logging;
  ctx->profiling = cfg->profiling;
  ctx->profiling_paused = 0;

  ctx->error = NULL;
  ctx->log = stderr;

  if (backend_context_setup(ctx) != 0) {
    return ctx;
  }

  set_tuning_params(ctx);
  setup_program(ctx);
  init_constants(ctx);
  (void)futhark_context_clear_caches(ctx);
  (void)futhark_context_sync(ctx);
  return ctx;
}
// Destroy a context created by futhark_context_new().
//
// Constants and the program state are torn down before the backend, the
// host free list is emptied and destroyed, the constants structure and both
// locks are released, and the configuration is marked as no longer in use
// before the context itself is freed.  The configuration is not freed here.
void futhark_context_free(struct futhark_context* ctx) {
free_constants(ctx);
teardown_program(ctx);
backend_context_teardown(ctx);
// Free the buffers held in the list before destroying the list itself.
free_all_in_free_list(ctx);
free_list_destroy(&ctx->free_list);
free(ctx->constants);
free_lock(&ctx->lock);
free_lock(&ctx->error_lock);
// Must happen before free(ctx): ctx->cfg is read through ctx.
ctx->cfg->in_use = 0;
free(ctx);
}
// End of context.h
// Forward declarations of the internal entry-point implementations.  Each
// takes the two input blocks by value plus the element count, and writes the
// result block through its mem_out_p_* pointer.
// NOTE(review): presumably both are defined later in this file - confirm
// for futrts_entry_add_i64.
static int futrts_entry_add(struct futhark_context *ctx, struct memblock_device *mem_out_p_6932, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6776);
static int futrts_entry_add_i64(struct futhark_context *ctx, struct memblock_device *mem_out_p_6938, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6836);
// Initialise program constants.  This program has none, so the function
// does nothing and always returns 0.
static int init_constants(struct futhark_context *ctx)
{
    (void) ctx;
    return 0;
}
// Release program constants.  This program has none, so the function does
// nothing and always returns 0.
static int free_constants(struct futhark_context *ctx)
{
    const int status = 0;
    (void) ctx;
    return status;
}
// A one-dimensional array of uint8_t values stored in device memory.
struct futhark_u8_1d {
// Reference-counted device block holding the elements.
struct memblock_device mem;
// shape[0] is the number of elements.
int64_t shape[1];
};
// Create a device array of `dim0` bytes and enqueue a non-blocking copy of
// `data` into it.
//
// Returns the new array, or NULL when the array structure or its device
// memory cannot be allocated.  On a device allocation failure the context
// lock is released and the array structure is freed before returning.
// Because the write is enqueued with CL_FALSE, `data` is not guaranteed to
// have been read when this function returns.
//
// NOTE(review): the error path inside OPENCL_SUCCEED_OR_RETURN is not
// visible here; if it returns early it does so with ctx->lock still held.
// NOTE(review): this host-to-device write is counted under the
// copy_dev_to_host profiling counters, which looks swapped - confirm.
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0)
{
    struct futhark_u8_1d *bad = NULL;
    struct futhark_u8_1d *arr = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d));
    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 1, "arr->mem")) {
        // Do not leave the context locked or leak the array on failure.
        lock_unlock(&ctx->lock);
        free(arr);
        return NULL;
    }
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 1 > 0)
        OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->queue, arr->mem.mem, CL_FALSE, (size_t) 0, (size_t) ((size_t) dim0 * 1), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_host_runs, &ctx->program->copy_dev_to_host_total_runtime)));
    lock_unlock(&ctx->lock);
    return arr;
}
// Create a device array of `dim0` bytes and enqueue a device-to-device copy
// into it from the buffer `data`, starting at byte `offset`.
//
// Returns the new array, or NULL when the array structure or its device
// memory cannot be allocated.  On a device allocation failure the context
// lock is released and the array structure is freed before returning.
// When ctx->debugging is set, the queue is drained with clFinish() after
// the copy is enqueued.
//
// NOTE(review): the error path inside OPENCL_SUCCEED_OR_RETURN is not
// visible here; if it returns early it does so with ctx->lock still held.
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0)
{
    struct futhark_u8_1d *bad = NULL;
    struct futhark_u8_1d *arr = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d));
    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 1, "arr->mem")) {
        // Do not leave the context locked or leak the array on failure.
        lock_unlock(&ctx->lock);
        free(arr);
        return NULL;
    }
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 1 > 0) {
        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->queue, data, arr->mem.mem, (size_t) offset, (size_t) 0, (size_t) ((size_t) dim0 * 1), 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_dev_runs, &ctx->program->copy_dev_to_dev_total_runtime)));
        if (ctx->debugging)
            OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
    }
    lock_unlock(&ctx->lock);
    return arr;
}
// Drop the array's reference to its device memory and free the array
// structure.
//
// Returns 0 on success.  If unreferencing the memory fails, 1 is returned
// and the array structure is not freed; the context lock is released on
// both paths.
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr)
{
    lock_lock(&ctx->lock);
    if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0) {
        // Release the lock on the error path as well.
        lock_unlock(&ctx->lock);
        return 1;
    }
    lock_unlock(&ctx->lock);
    free(arr);
    return 0;
}
// Enqueue a non-blocking read of the array's elements into `data`, which
// must have room for arr->shape[0] bytes.
//
// The read is always enqueued with CL_FALSE, so `data` is not guaranteed to
// be filled when this function returns.  Returns 0 when the end of the
// function is reached, and 1 if the synchronisation step fails; the context
// lock is released on both paths.
//
// NOTE(review): the error path inside OPENCL_SUCCEED_OR_RETURN is not
// visible here; if it returns early it does so with ctx->lock still held.
// NOTE(review): this device-to-host read is counted under the
// copy_host_to_dev profiling counters, which looks swapped - confirm.
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data)
{
    lock_lock(&ctx->lock);
    if ((size_t) arr->shape[0] * 1 > 0) {
        cl_bool sync_call = CL_FALSE;
        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->queue, arr->mem.mem, ctx->failure_is_an_option ? CL_FALSE : sync_call, (size_t) 0, (size_t) ((size_t) arr->shape[0] * 1), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_host_to_dev_runs, &ctx->program->copy_host_to_dev_total_runtime)));
        if ((sync_call && ctx->failure_is_an_option) && futhark_context_sync(ctx) != 0) {
            // Release the lock on the error path as well.
            lock_unlock(&ctx->lock);
            return 1;
        }
    }
    lock_unlock(&ctx->lock);
    return 0;
}
// Return the OpenCL buffer that backs the array.  The context argument is
// unused and no reference is taken on the buffer.
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr)
{
    cl_mem backing_buffer = arr->mem.mem;
    (void) ctx;
    return backing_buffer;
}
// Return a pointer to the array's shape (one entry: the element count).
// The pointer refers to storage inside `arr`.  The context argument is
// unused.
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr)
{
    const int64_t *dims = arr->shape;
    (void) ctx;
    return dims;
}
// A one-dimensional array of int64_t values stored in device memory.
struct futhark_i64_1d {
// Reference-counted device block holding the elements.
struct memblock_device mem;
// shape[0] is the number of elements.
int64_t shape[1];
};
// Create a device array of `dim0` 64-bit integers and enqueue a
// non-blocking copy of `data` into it.
//
// Returns the new array, or NULL when the array structure or its device
// memory cannot be allocated.  On a device allocation failure the context
// lock is released and the array structure is freed before returning.
// Because the write is enqueued with CL_FALSE, `data` is not guaranteed to
// have been read when this function returns.
//
// NOTE(review): the error path inside OPENCL_SUCCEED_OR_RETURN is not
// visible here; if it returns early it does so with ctx->lock still held.
// NOTE(review): this host-to-device write is counted under the
// copy_dev_to_host profiling counters, which looks swapped - confirm.
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0)
{
    struct futhark_i64_1d *bad = NULL;
    struct futhark_i64_1d *arr = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d));
    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 8, "arr->mem")) {
        // Do not leave the context locked or leak the array on failure.
        lock_unlock(&ctx->lock);
        free(arr);
        return NULL;
    }
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 8 > 0)
        OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->queue, arr->mem.mem, CL_FALSE, (size_t) 0, (size_t) ((size_t) dim0 * 8), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_host_runs, &ctx->program->copy_dev_to_host_total_runtime)));
    lock_unlock(&ctx->lock);
    return arr;
}
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0)
{
struct futhark_i64_1d *bad = NULL;
struct futhark_i64_1d *arr = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d));
if (arr == NULL)
return bad;
lock_lock(&ctx->lock);
arr->mem.references = NULL;
if (memblock_alloc_device(ctx, &arr->mem, dim0 * 8, "arr->mem"))
return NULL;
arr->shape[0] = dim0;
if ((size_t) dim0 * 8 > 0) {
OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->queue, data, arr->mem.mem, (size_t) offset, (size_t) 0, (size_t) ((size_t) dim0 * 8), 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_dev_runs, &ctx->program->copy_dev_to_dev_total_runtime)));
if (ctx->debugging)
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
}
lock_unlock(&ctx->lock);
return arr;
}
// Drop the array's reference to its device memory and free the array
// structure.
//
// Returns 0 on success.  If unreferencing the memory fails, 1 is returned
// and the array structure is not freed; the context lock is released on
// both paths.
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr)
{
    lock_lock(&ctx->lock);
    if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0) {
        // Release the lock on the error path as well.
        lock_unlock(&ctx->lock);
        return 1;
    }
    lock_unlock(&ctx->lock);
    free(arr);
    return 0;
}
// Enqueue a non-blocking read of the array's elements into `data`, which
// must have room for arr->shape[0] 64-bit integers.
//
// The read is always enqueued with CL_FALSE, so `data` is not guaranteed to
// be filled when this function returns.  Returns 0 when the end of the
// function is reached, and 1 if the synchronisation step fails; the context
// lock is released on both paths.
//
// NOTE(review): the error path inside OPENCL_SUCCEED_OR_RETURN is not
// visible here; if it returns early it does so with ctx->lock still held.
// NOTE(review): this device-to-host read is counted under the
// copy_host_to_dev profiling counters, which looks swapped - confirm.
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data)
{
    lock_lock(&ctx->lock);
    if ((size_t) arr->shape[0] * 8 > 0) {
        cl_bool sync_call = CL_FALSE;
        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->queue, arr->mem.mem, ctx->failure_is_an_option ? CL_FALSE : sync_call, (size_t) 0, (size_t) ((size_t) arr->shape[0] * 8), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_host_to_dev_runs, &ctx->program->copy_host_to_dev_total_runtime)));
        if ((sync_call && ctx->failure_is_an_option) && futhark_context_sync(ctx) != 0) {
            // Release the lock on the error path as well.
            lock_unlock(&ctx->lock);
            return 1;
        }
    }
    lock_unlock(&ctx->lock);
    return 0;
}
// Return the OpenCL buffer that backs the array.  The context argument is
// unused and no reference is taken on the buffer.
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr)
{
    cl_mem backing_buffer = arr->mem.mem;
    (void) ctx;
    return backing_buffer;
}
// Return a pointer to the array's shape (one entry: the element count).
// The pointer refers to storage inside `arr`.  The context argument is
// unused.
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr)
{
    const int64_t *dims = arr->shape;
    (void) ctx;
    return dims;
}
// Launches the OpenCL kernel ctx->program->addzisegmap_6892 (logged as
// "add.segmap_6892") for n_6776 elements.
//
// A fresh device block mem_6920 of bytes_6919 bytes is allocated for the result.
// xs_mem_6916 and ys_mem_6917 are bound as kernel arguments 2 and 3, the new block
// as argument 4, and n_6776 as argument 1. After the launch the result block is
// passed on to *mem_out_p_6932 via memblock_set_device.
// Returns 0 on success and 1 on the failure paths visible here.
//
// NOTE(review): the two `return 1` statements after memblock_set_device skip the
// cleanup label, and OPENCL_SUCCEED_OR_RETURN (defined elsewhere) presumably also
// returns directly on failure. On those paths mem_6920 / mem_out_6922 are not
// unreferenced by this function — confirm whether device memory can leak.
static int futrts_entry_add(struct futhark_context *ctx, struct memblock_device *mem_out_p_6932, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6776)
{
// Has no effect here: ctx is used throughout the body.
(void) ctx;
int err = 0;
// Local blocks start with a NULL reference pointer so that the unref calls in
// cleanup can be issued even if the block was never allocated.
struct memblock_device mem_6920;
mem_6920.references = NULL;
struct memblock_device mem_out_6922;
mem_out_6922.references = NULL;
// smax64 is defined elsewhere; presumably a signed max that clamps the size at 0.
int64_t bytes_6919 = smax64((int64_t) 0, n_6776);
int64_t segmap_group_sizze_6888;
segmap_group_sizze_6888 = *ctx->tuning_params.addzisegmap_group_sizze_6879;
// sdiv_up64 is defined elsewhere; presumably a division that rounds up.
int64_t segmap_usable_groups_6889 = sdiv_up64(n_6776, segmap_group_sizze_6888);
if (memblock_alloc_device(ctx, &mem_6920, bytes_6919, "mem_6920")) {
err = 1;
goto cleanup;
}
if (ctx->debugging)
fprintf(ctx->log, "%s\n", "\n# SegMap");
// Computed but not referenced again in this function.
int32_t virt_num_groups_6923 = sext_i64_i32(sdiv_up64(n_6776, segmap_group_sizze_6888));
// Kernel argument index 0 is not set in this function.
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 1, sizeof(n_6776), &n_6776));
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 2, sizeof(xs_mem_6916.mem), &xs_mem_6916.mem));
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 3, sizeof(ys_mem_6917.mem), &ys_mem_6917.mem));
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 4, sizeof(mem_6920.mem), &mem_6920.mem));
// The kernel is only enqueued when the global work size is non-zero.
if (1 * ((size_t) segmap_usable_groups_6889 * (size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879) != 0) {
const size_t global_work_sizze_6933[1] = {(size_t) segmap_usable_groups_6889 * (size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879};
const size_t local_work_sizze_6937[1] = {(size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879};
int64_t time_start_6934 = 0, time_end_6935 = 0;
if (ctx->debugging) {
fprintf(ctx->log, "Launching %s with global work size [%zu] and local work size [%zu]; local memory: %d bytes.\n", "add.segmap_6892", global_work_sizze_6933[0], local_work_sizze_6937[0], (int) 0);
time_start_6934 = get_wall_time();
}
// A profiling event is requested only while profiling is enabled and not paused.
cl_event *pevent = ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->addzisegmap_6892_runs, &ctx->program->addzisegmap_6892_total_runtime);
OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->queue, ctx->program->addzisegmap_6892, 1, NULL, global_work_sizze_6933, local_work_sizze_6937, 0, NULL, pevent));
// In debugging mode the queue is drained so the wall-clock time can be logged.
if (ctx->debugging) {
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
time_end_6935 = get_wall_time();
long time_diff_6936 = time_end_6935 - time_start_6934;
fprintf(ctx->log, "kernel %s runtime: %ldus\n", "add.segmap_6892", time_diff_6936);
}
}
if (ctx->debugging)
fprintf(ctx->log, "%s\n", "");
// Pass the result block along: first into the local mem_out_6922, then into the
// caller's output block.
if (memblock_set_device(ctx, &mem_out_6922, &mem_6920, "mem_6920") != 0)
return 1;
if (memblock_set_device(ctx, &*mem_out_p_6932, &mem_out_6922, "mem_out_6922") != 0)
return 1;
cleanup:
{
// Drop this function's own references to the local blocks.
if (memblock_unref_device(ctx, &mem_6920, "mem_6920") != 0)
return 1;
if (memblock_unref_device(ctx, &mem_out_6922, "mem_out_6922") != 0)
return 1;
}
return err;
}
// Launches the OpenCL kernel ctx->program->add_i64zisegmap_6912 (logged as
// "add_i64.segmap_6912") for n_6836 elements.
//
// A fresh device block mem_6921 of bytes_6920 bytes (derived from 8 * n_6836) is
// allocated for the result. xs_mem_6916 and ys_mem_6917 are bound as kernel
// arguments 2 and 3, the new block as argument 4, and n_6836 as argument 1. After
// the launch the result block is passed on to *mem_out_p_6938 via
// memblock_set_device.
// Returns 0 on success and 1 on the failure paths visible here.
//
// NOTE(review): the two `return 1` statements after memblock_set_device skip the
// cleanup label, and OPENCL_SUCCEED_OR_RETURN (defined elsewhere) presumably also
// returns directly on failure. On those paths mem_6921 / mem_out_6922 are not
// unreferenced by this function — confirm whether device memory can leak.
static int futrts_entry_add_i64(struct futhark_context *ctx, struct memblock_device *mem_out_p_6938, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6836)
{
// Has no effect here: ctx is used throughout the body.
(void) ctx;
int err = 0;
// Local blocks start with a NULL reference pointer so that the unref calls in
// cleanup can be issued even if the block was never allocated.
struct memblock_device mem_6921;
mem_6921.references = NULL;
struct memblock_device mem_out_6922;
mem_out_6922.references = NULL;
// Eight bytes per element.
int64_t binop_y_6919 = (int64_t) 8 * n_6836;
// smax64 is defined elsewhere; presumably a signed max that clamps the size at 0.
int64_t bytes_6920 = smax64((int64_t) 0, binop_y_6919);
int64_t segmap_group_sizze_6908;
segmap_group_sizze_6908 = *ctx->tuning_params.add_i64zisegmap_group_sizze_6899;
// sdiv_up64 is defined elsewhere; presumably a division that rounds up.
int64_t segmap_usable_groups_6909 = sdiv_up64(n_6836, segmap_group_sizze_6908);
if (memblock_alloc_device(ctx, &mem_6921, bytes_6920, "mem_6921")) {
err = 1;
goto cleanup;
}
if (ctx->debugging)
fprintf(ctx->log, "%s\n", "\n# SegMap");
// Computed but not referenced again in this function.
int32_t virt_num_groups_6923 = sext_i64_i32(sdiv_up64(n_6836, segmap_group_sizze_6908));
// Kernel argument index 0 is not set in this function.
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 1, sizeof(n_6836), &n_6836));
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 2, sizeof(xs_mem_6916.mem), &xs_mem_6916.mem));
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 3, sizeof(ys_mem_6917.mem), &ys_mem_6917.mem));
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 4, sizeof(mem_6921.mem), &mem_6921.mem));
// The kernel is only enqueued when the global work size is non-zero.
if (1 * ((size_t) segmap_usable_groups_6909 * (size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899) != 0) {
const size_t global_work_sizze_6939[1] = {(size_t) segmap_usable_groups_6909 * (size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899};
const size_t local_work_sizze_6943[1] = {(size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899};
int64_t time_start_6940 = 0, time_end_6941 = 0;
if (ctx->debugging) {
fprintf(ctx->log, "Launching %s with global work size [%zu] and local work size [%zu]; local memory: %d bytes.\n", "add_i64.segmap_6912", global_work_sizze_6939[0], local_work_sizze_6943[0], (int) 0);
time_start_6940 = get_wall_time();
}
// A profiling event is requested only while profiling is enabled and not paused.
cl_event *pevent = ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->add_i64zisegmap_6912_runs, &ctx->program->add_i64zisegmap_6912_total_runtime);
OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->queue, ctx->program->add_i64zisegmap_6912, 1, NULL, global_work_sizze_6939, local_work_sizze_6943, 0, NULL, pevent));
// In debugging mode the queue is drained so the wall-clock time can be logged.
if (ctx->debugging) {
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
time_end_6941 = get_wall_time();
long time_diff_6942 = time_end_6941 - time_start_6940;
fprintf(ctx->log, "kernel %s runtime: %ldus\n", "add_i64.segmap_6912", time_diff_6942);
}
}
if (ctx->debugging)
fprintf(ctx->log, "%s\n", "");
// Pass the result block along: first into the local mem_out_6922, then into the
// caller's output block.
if (memblock_set_device(ctx, &mem_out_6922, &mem_6921, "mem_6921") != 0)
return 1;
if (memblock_set_device(ctx, &*mem_out_p_6938, &mem_out_6922, "mem_out_6922") != 0)
return 1;
cleanup:
{
// Drop this function's own references to the local blocks.
if (memblock_unref_device(ctx, &mem_6921, "mem_6921") != 0)
return 1;
if (memblock_unref_device(ctx, &mem_out_6922, "mem_out_6922") != 0)
return 1;
}
return err;
}
// Public entry point "add" for u8 arrays.
//
// in0 and in1 must have the same length; otherwise an error message is recorded on
// the context and 1 is returned. On success a new struct futhark_u8_1d is
// allocated with malloc, filled with the result block and the common length, and
// stored in *out0. *out0 is untouched on failure.
//
// Returns 0 on success, otherwise the non-zero status from the size check or from
// futrts_entry_add. The context lock is held for the whole call.
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1)
{
    int64_t n_6776 = (int64_t) 0;
    int ret = 0;

    lock_lock(&ctx->lock);

    struct memblock_device mem_out_6922;
    mem_out_6922.references = NULL;
    struct memblock_device ys_mem_6917;
    ys_mem_6917.references = NULL;
    struct memblock_device xs_mem_6916;
    xs_mem_6916.references = NULL;

    xs_mem_6916 = in0->mem;
    n_6776 = in0->shape[0];
    ys_mem_6917 = in1->mem;
    n_6776 = in1->shape[0];

    // n_6776 now holds in1's length, so this effectively compares the two lengths.
    if (!(n_6776 == in0->shape[0] && n_6776 == in1->shape[0])) {
        ret = 1;
        set_error(ctx, msgprintf("Error: entry point arguments have invalid sizes.\n"));
    }

    if (ret == 0) {
        ret = futrts_entry_add(ctx, &mem_out_6922, xs_mem_6916, ys_mem_6917, n_6776);
        if (ret == 0) {
            // The allocation must not live inside assert(): with NDEBUG defined the
            // whole expression is compiled out and *out0 would be used unset.
            *out0 = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d));
            assert(*out0 != NULL);
            (*out0)->mem = mem_out_6922;
            (*out0)->shape[0] = n_6776;
        }
    }

    lock_unlock(&ctx->lock);
    return ret;
}
// Public entry point "add_i64" for i64 arrays.
//
// in0 and in1 must have the same length; otherwise an error message is recorded on
// the context and 1 is returned. On success a new struct futhark_i64_1d is
// allocated with malloc, filled with the result block and the common length, and
// stored in *out0. *out0 is untouched on failure.
//
// Returns 0 on success, otherwise the non-zero status from the size check or from
// futrts_entry_add_i64. The context lock is held for the whole call.
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1)
{
    int64_t n_6836 = (int64_t) 0;
    int ret = 0;

    lock_lock(&ctx->lock);

    struct memblock_device mem_out_6922;
    mem_out_6922.references = NULL;
    struct memblock_device ys_mem_6917;
    ys_mem_6917.references = NULL;
    struct memblock_device xs_mem_6916;
    xs_mem_6916.references = NULL;

    xs_mem_6916 = in0->mem;
    n_6836 = in0->shape[0];
    ys_mem_6917 = in1->mem;
    n_6836 = in1->shape[0];

    // n_6836 now holds in1's length, so this effectively compares the two lengths.
    if (!(n_6836 == in0->shape[0] && n_6836 == in1->shape[0])) {
        ret = 1;
        set_error(ctx, msgprintf("Error: entry point arguments have invalid sizes.\n"));
    }

    if (ret == 0) {
        ret = futrts_entry_add_i64(ctx, &mem_out_6922, xs_mem_6916, ys_mem_6917, n_6836);
        if (ret == 0) {
            // The allocation must not live inside assert(): with NDEBUG defined the
            // whole expression is compiled out and *out0 would be used unset.
            *out0 = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d));
            assert(*out0 != NULL);
            (*out0)->mem = mem_out_6922;
            (*out0)->shape[0] = n_6836;
        }
    }

    lock_unlock(&ctx->lock);
    return ret;
}
defmodule Map.NIF do
  @moduledoc """
  Elixir-side stubs for the native functions in `./lib_map_nif`.

  The native library is loaded when this module is loaded (`@on_load`). Every
  function below is a placeholder that raises; it is only reached when the
  native implementation has not replaced it.
  """

  @on_load :load_nifs

  # Loads the shared library from the current working directory.
  def load_nifs, do: :erlang.load_nif('./lib_map_nif', 0)

  # --- Context handling ---------------------------------------------------

  def futhark_context_config_new,
    do: raise("NIF futhark_context_config_new not implemented")

  def futhark_context_new(_cfg),
    do: raise("NIF futhark_context_new not implemented")

  def futhark_context_sync(_ctx),
    do: raise("NIF futhark_context_sync not implemented")

  # --- i64 arrays <-> binaries --------------------------------------------

  def futhark_new_i64_1d(_ctx, _binary),
    do: raise("NIF futhark_new_i64_1d not implemented")

  def futhark_i64_1d_to_binary(_ctx, _in),
    do: raise("NIF futhark_i64_1d_to_binary not implemented")

  # --- u8 arrays <-> binaries ---------------------------------------------

  def futhark_new_u8_1d(_ctx, _binary),
    do: raise("NIF futhark_new_u8_1d not implemented")

  def futhark_u8_1d_to_binary(_ctx, _in),
    do: raise("NIF futhark_u8_1d_to_binary not implemented")

  # --- Entry points ---------------------------------------------------------

  def futhark_entry_add(_ctx, _xs, _ys),
    do: raise("NIF futhark_entry_add not implemented")

  def futhark_entry_add_i64(_ctx, _xs, _ys),
    do: raise("NIF futhark_entry_add_i64 not implemented")
end
// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs)
// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]
#pragma once
// Headers
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <float.h>
#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Initialisation
// The configuration and context types are only forward-declared in this header,
// so users handle them exclusively through pointers.
struct futhark_context_config;
struct futhark_context_config *futhark_context_config_new(void);
void futhark_context_config_free(struct futhark_context_config *cfg);
// NOTE(review): presumably assigns new_value to the tuning parameter named
// param_name; the meaning of the int result is not visible here — confirm.
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value);
struct futhark_context;
// Creates a context from a configuration.
struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
// NOTE(review): the parameter is spelled `cfg` although its type is
// struct futhark_context.
void futhark_context_free(struct futhark_context *cfg);
// Setters on the configuration object. Their implementations are not part of this
// header; the names below are the only description available here.
void futhark_context_config_add_build_option(struct futhark_context_config *cfg, const char *opt);
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
void futhark_context_config_list_devices(struct futhark_context_config *cfg);
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);
// The second parameter (an OpenCL command queue) is left unnamed in this prototype.
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue);
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag);
// Tuning-parameter introspection; the int argument is presumably an index below
// futhark_get_tuning_param_count() — confirm.
int futhark_get_tuning_param_count(void);
const char *futhark_get_tuning_param_name(int);
const char *futhark_get_tuning_param_class(int);
// Arrays
// One-dimensional i64 arrays; the struct is opaque to users of this header.
struct futhark_i64_1d;
// NOTE(review): presumably builds an array of dim0 elements from host memory
// `data`; the implementation is not shown here — confirm.
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0);
// Builds an array of dim0 elements by copying dim0 * 8 bytes out of the existing
// OpenCL buffer `data`, starting at source offset `offset`.
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
// Releases the array's device block and frees the descriptor. Returns 0 on
// success and 1 if the device block could not be released.
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
// Enqueues a non-blocking read of the array (shape[0] * 8 bytes) into `data`.
// Returns 0 once the read has been enqueued.
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data);
// Returns the OpenCL buffer that backs the array.
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
// Returns a pointer to the array's dimension sizes; element 0 is the length.
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
// One-dimensional u8 arrays.
// NOTE(review): presumably these mirror the i64 functions above with uint8_t
// elements — confirm against the implementation.
struct futhark_u8_1d;
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0);
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data);
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
// Opaque values
// Entry points
// Both entry points take two input arrays of equal length and, on success, store
// a newly allocated result array in *out0 and return 0. They return 1 when the
// input lengths differ; other non-zero results come from the kernel launch.
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1);
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1);
// Miscellaneous
// Returns an int status; array reads enqueued by futhark_values_i64_1d are
// non-blocking, so this is presumably the call that waits for them — confirm.
int futhark_context_sync(struct futhark_context *ctx);
cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f);
// Both return a non-const char *.
// NOTE(review): ownership of the returned strings is not stated here — confirm
// whether the caller must free them.
char *futhark_context_report(struct futhark_context *ctx);
char *futhark_context_get_error(struct futhark_context *ctx);
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
void futhark_context_pause_profiling(struct futhark_context *ctx);
void futhark_context_unpause_profiling(struct futhark_context *ctx);
int futhark_context_clear_caches(struct futhark_context *ctx);
// Marker macro naming the backend this code was generated for.
#define FUTHARK_BACKEND_opencl
// Integer status codes.
// NOTE(review): presumably the values returned by the int-returning API
// functions; value 1 has no name here — confirm.
#define FUTHARK_SUCCESS 0
#define FUTHARK_PROGRAM_ERROR 2
#define FUTHARK_OUT_OF_MEMORY 3
#ifdef __cplusplus
}
#endif
{
"backend": "opencl",
"entry_points": {
"add": {
"cfun": "futhark_entry_add",
"inputs": [
{
"name": "xs",
"type": "[]u8",
"unique": false
},
{
"name": "ys",
"type": "[]u8",
"unique": false
}
],
"outputs": [
{
"type": "[]u8",
"unique": false
}
],
"tuning_params": [
"add.segmap_group_size_6879"
]
},
"add_i64": {
"cfun": "futhark_entry_add_i64",
"inputs": [
{
"name": "xs",
"type": "[]i64",
"unique": false
},
{
"name": "ys",
"type": "[]i64",
"unique": false
}
],
"outputs": [
{
"type": "[]i64",
"unique": false
}
],
"tuning_params": [
"add_i64.segmap_group_size_6899"
]
}
},
"types": {
"[]i64": {
"ctype": "struct futhark_i64_1d *",
"elemtype": "i64",
"kind": "array",
"ops": {
"free": "futhark_free_i64_1d",
"new": "futhark_new_i64_1d",
"shape": "futhark_shape_i64_1d",
"values": "futhark_values_i64_1d"
},
"rank": 1
},
"[]u8": {
"ctype": "struct futhark_u8_1d *",
"elemtype": "u8",
"kind": "array",
"ops": {
"free": "futhark_free_u8_1d",
"new": "futhark_new_u8_1d",
"shape": "futhark_shape_u8_1d",
"values": "futhark_values_u8_1d"
},
"rank": 1
}
},
"version": "0.25.0 (prerelease - include info below when reporting bugs)\ngit: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]"
}
#include <erl_nif.h>
#include "lib_map.c"
/* Forward declaration; the type itself comes from the included lib_map.c. */
struct futhark_context;
/* The atom `ok`, created once in load() and reused in every reply. */
ERL_NIF_TERM atom_ok;
/* Resource type handles, filled in by load() through open_resource(). Each
 * resource of these types stores a single pointer to the corresponding Futhark
 * object (config, context, i64 array, u8 array). */
ErlNifResourceType* CONFIG_TYPE;
ErlNifResourceType* CONTEXT_TYPE;
ErlNifResourceType* I64_1D;
ErlNifResourceType* U8_1D;
/*
 * Opens (or takes over) the NIF resource type called `name` and stores the
 * handle in *resource_type.
 *
 * Returns 0 on success and -1 if enif_open_resource_type() yields NULL.
 *
 * Two fixes compared with the previous version:
 *  - the result check looks at the handle that was just opened; it used to test
 *    CONFIG_TYPE, so a failure for any later resource type went unnoticed;
 *  - module_str is passed as NULL, as the erl_nif documentation requires.
 */
static int open_resource(ErlNifEnv* env, ErlNifResourceType** resource_type, const char* name)
{
    int flags = ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER;

    /* No destructor is registered, so the wrapped Futhark object is not freed
     * when a resource is garbage collected. */
    *resource_type = enif_open_resource_type(env, NULL, name, NULL, flags, NULL);
    if (*resource_type == NULL) return -1;
    return 0;
}
/*
 * NIF load callback: opens the four resource types used by this library, in the
 * order Config, Context, i64_1d, u8_1d, and creates the `ok` atom.
 *
 * Returns 0 on success and -1 as soon as one resource type cannot be opened.
 * priv and load_info are not used.
 */
static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info)
{
    const struct {
        ErlNifResourceType **handle;
        const char *name;
    } wanted[] = {
        { &CONFIG_TYPE,  "Config"  },
        { &CONTEXT_TYPE, "Context" },
        { &I64_1D,       "i64_1d"  },
        { &U8_1D,        "u8_1d"   },
    };
    const size_t wanted_count = sizeof wanted / sizeof wanted[0];

    for (size_t i = 0; i < wanted_count; i++) {
        if (open_resource(env, wanted[i].handle, wanted[i].name) == -1) {
            return -1;
        }
    }

    atom_ok = enif_make_atom(env, "ok");
    return 0;
}
/*
 * NIF: futhark_context_config_new/0.
 *
 * Creates a Futhark context configuration and wraps the pointer in a CONFIG_TYPE
 * resource. Returns {ok, Resource}; raises badarg on a wrong argument count or
 * when the configuration or the resource cannot be allocated.
 *
 * The configuration is created first and checked for NULL, so a NULL pointer can
 * no longer end up inside a resource and be dereferenced by a later call.
 */
static ERL_NIF_TERM futhark_context_config_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    (void) argv;

    if (argc != 0) {
        return enif_make_badarg(env);
    }

    struct futhark_context_config *cfg = futhark_context_config_new();
    if (cfg == NULL) {
        return enif_make_badarg(env);
    }

    struct futhark_context_config **res = enif_alloc_resource(CONFIG_TYPE, sizeof(struct futhark_context_config *));
    if (res == NULL) {
        /* Do not leak the configuration when the wrapper cannot be allocated. */
        futhark_context_config_free(cfg);
        return enif_make_badarg(env);
    }
    *res = cfg;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM ret = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_context_new/1.
 *
 * argv[0] is a CONFIG_TYPE resource. Creates a Futhark context from it and wraps
 * the pointer in a CONTEXT_TYPE resource. Returns {ok, Resource}; raises badarg
 * on bad arguments or when the context could not be created.
 *
 * Context creation is now validated: a NULL context, or one that reports an
 * initialisation error through futhark_context_get_error(), is rejected instead
 * of being handed to Erlang as if it were usable.
 */
static ERL_NIF_TERM futhark_context_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context_config **cfg;

    if (argc != 1) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONFIG_TYPE, (void**) &cfg)) {
        return enif_make_badarg(env);
    }

    struct futhark_context *new_ctx = futhark_context_new(*cfg);
    if (new_ctx == NULL) {
        return enif_make_badarg(env);
    }

    /* A non-NULL error string right after creation means initialisation failed.
     * The string is owned by the caller; the context still has to be freed. */
    char *init_error = futhark_context_get_error(new_ctx);
    if (init_error != NULL) {
        free(init_error);
        futhark_context_free(new_ctx);
        return enif_make_badarg(env);
    }

    struct futhark_context **res = enif_alloc_resource(CONTEXT_TYPE, sizeof(struct futhark_context *));
    if (res == NULL) {
        futhark_context_free(new_ctx);
        return enif_make_badarg(env);
    }
    *res = new_ctx;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM ret = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_context_sync/1.
 *
 * argv[0] is a CONTEXT_TYPE resource. Synchronises the context and returns the
 * atom `ok`. Raises badarg on bad arguments.
 *
 * The result of futhark_context_sync() is no longer discarded: a non-zero status
 * raises badarg (the error convention used by the other NIFs in this file)
 * instead of reporting `ok` for a failed synchronisation.
 */
static ERL_NIF_TERM futhark_context_sync_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;

    if (argc != 1) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }

    if (futhark_context_sync(*ctx) != 0) {
        return enif_make_badarg(env);
    }
    return atom_ok;
}
/*
 * NIF: futhark_new_i64_1d/2.
 *
 * argv[0] is a CONTEXT_TYPE resource, argv[1] a binary holding native-endian
 * 64-bit integers. Creates a Futhark i64 array with size/8 elements and wraps it
 * in an I64_1D resource. Returns {ok, Resource}; raises badarg on bad arguments
 * or when the array cannot be created.
 *
 * Changes compared with the previous version:
 *  - a binary whose size is not a multiple of 8 is rejected instead of silently
 *    losing its trailing bytes;
 *  - a NULL result from futhark_new_i64_1d() is detected;
 *  - the array is created before the resource, and freed again if the resource
 *    cannot be allocated, so no path leaks either of them;
 *  - the unused shape lookup was removed.
 */
static ERL_NIF_TERM futhark_new_i64_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;
    ErlNifBinary bin;

    if (argc != 2) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }
    if (!enif_inspect_binary(env, argv[1], &bin)) {
        return enif_make_badarg(env);
    }
    if (bin.size % sizeof(int64_t) != 0) {
        return enif_make_badarg(env);
    }

    struct futhark_i64_1d *arr = futhark_new_i64_1d(*ctx, (const int64_t *)bin.data, (int64_t) (bin.size / sizeof(int64_t)));
    if (arr == NULL) {
        return enif_make_badarg(env);
    }

    struct futhark_i64_1d **res = enif_alloc_resource(I64_1D, sizeof(struct futhark_i64_1d *));
    if (res == NULL) {
        futhark_free_i64_1d(*ctx, arr);
        return enif_make_badarg(env);
    }
    *res = arr;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM ret = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_i64_1d_to_binary/2.
 *
 * argv[0] is a CONTEXT_TYPE resource, argv[1] an I64_1D resource. Copies the
 * array's elements into a new binary of shape[0] * 8 bytes and returns
 * {ok, Binary}. Raises badarg on bad arguments or when the copy fails.
 *
 * Changes compared with the previous version:
 *  - the result of enif_alloc_binary() is checked;
 *  - the binary is released on the error path instead of being leaked;
 *  - the result of futhark_context_sync() is checked. The read is only enqueued
 *    by futhark_values_i64_1d(), so the data is not valid until the sync has
 *    succeeded.
 */
static ERL_NIF_TERM futhark_i64_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;
    struct futhark_i64_1d **xs;
    ErlNifBinary binary;

    if (argc != 2) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
        return enif_make_badarg(env);
    }

    const int64_t *shape = futhark_shape_i64_1d(*ctx, *xs);
    if (!enif_alloc_binary(shape[0] * sizeof(int64_t), &binary)) {
        return enif_make_badarg(env);
    }

    if (futhark_values_i64_1d(*ctx, *xs, (int64_t *)(binary.data)) != 0
        || futhark_context_sync(*ctx) != 0) {
        /* Ownership was never handed to a term, so the binary is still ours. */
        enif_release_binary(&binary);
        return enif_make_badarg(env);
    }

    /* enif_make_binary() transfers ownership of the data to the term. */
    ERL_NIF_TERM ret = enif_make_binary(env, &binary);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_new_u8_1d/2.
 *
 * argv[0] is a CONTEXT_TYPE resource, argv[1] a binary whose bytes become the
 * elements. Creates a Futhark u8 array and wraps it in a U8_1D resource. Returns
 * {ok, Resource}; raises badarg on bad arguments or when the array cannot be
 * created.
 *
 * Changes compared with the previous version:
 *  - a NULL result from futhark_new_u8_1d() is detected;
 *  - the array is created before the resource, and freed again if the resource
 *    cannot be allocated, so no path leaks either of them;
 *  - the unused shape lookup was removed.
 */
static ERL_NIF_TERM futhark_new_u8_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;
    ErlNifBinary bin;

    if (argc != 2) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }
    if (!enif_inspect_binary(env, argv[1], &bin)) {
        return enif_make_badarg(env);
    }

    struct futhark_u8_1d *arr = futhark_new_u8_1d(*ctx, (const uint8_t *)bin.data, (int64_t) (bin.size / sizeof(uint8_t)));
    if (arr == NULL) {
        return enif_make_badarg(env);
    }

    struct futhark_u8_1d **res = enif_alloc_resource(U8_1D, sizeof(struct futhark_u8_1d *));
    if (res == NULL) {
        futhark_free_u8_1d(*ctx, arr);
        return enif_make_badarg(env);
    }
    *res = arr;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM ret = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_u8_1d_to_binary/2.
 *
 * argv[0] is a CONTEXT_TYPE resource, argv[1] a U8_1D resource. Copies the
 * array's elements into a new binary of shape[0] bytes and returns
 * {ok, Binary}. Raises badarg on bad arguments or when the copy fails.
 *
 * Changes compared with the previous version:
 *  - the result of enif_alloc_binary() is checked;
 *  - the binary is released on the error path instead of being leaked;
 *  - the result of futhark_context_sync() is checked, so a failed
 *    synchronisation is no longer reported as {ok, Binary}.
 */
static ERL_NIF_TERM futhark_u8_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;
    struct futhark_u8_1d **xs;
    ErlNifBinary binary;

    if (argc != 2) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
        return enif_make_badarg(env);
    }

    const int64_t *shape = futhark_shape_u8_1d(*ctx, *xs);
    if (!enif_alloc_binary(shape[0] * sizeof(uint8_t), &binary)) {
        return enif_make_badarg(env);
    }

    if (futhark_values_u8_1d(*ctx, *xs, (uint8_t *)(binary.data)) != 0
        || futhark_context_sync(*ctx) != 0) {
        /* Ownership was never handed to a term, so the binary is still ours. */
        enif_release_binary(&binary);
        return enif_make_badarg(env);
    }

    /* enif_make_binary() transfers ownership of the data to the term. */
    ERL_NIF_TERM ret = enif_make_binary(env, &binary);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_entry_add/3.
 *
 * argv[0] is a CONTEXT_TYPE resource; argv[1] and argv[2] are U8_1D resources.
 * Runs the Futhark entry point `add` and wraps the resulting array in a new
 * U8_1D resource. Returns {ok, Resource}; raises badarg on bad arguments or when
 * the entry point reports a non-zero status.
 *
 * The entry point now runs before the resource is allocated. Previously the
 * resource was allocated first and never released when the entry point failed.
 */
static ERL_NIF_TERM futhark_entry_add_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;
    struct futhark_u8_1d **xs;
    struct futhark_u8_1d **ys;

    if (argc != 3) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[2], U8_1D, (void**) &ys)) {
        return enif_make_badarg(env);
    }

    struct futhark_u8_1d *sum = NULL;
    if (futhark_entry_add(*ctx, &sum, *xs, *ys) != 0) {
        return enif_make_badarg(env);
    }

    struct futhark_u8_1d **res = enif_alloc_resource(U8_1D, sizeof(struct futhark_u8_1d *));
    if (res == NULL) {
        /* Do not leak the freshly computed array. */
        futhark_free_u8_1d(*ctx, sum);
        return enif_make_badarg(env);
    }
    *res = sum;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM ret = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, ret);
}
/*
 * NIF: futhark_entry_add_i64/3.
 *
 * argv[0] is a CONTEXT_TYPE resource; argv[1] and argv[2] are I64_1D resources.
 * Runs the Futhark entry point `add_i64` and wraps the resulting array in a new
 * I64_1D resource. Returns {ok, Resource}; raises badarg on bad arguments or
 * when the entry point reports a non-zero status.
 *
 * The entry point now runs before the resource is allocated. Previously the
 * resource was allocated first and never released when the entry point failed.
 */
static ERL_NIF_TERM futhark_entry_add_i64_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    struct futhark_context **ctx;
    struct futhark_i64_1d **xs;
    struct futhark_i64_1d **ys;

    if (argc != 3) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[2], I64_1D, (void**) &ys)) {
        return enif_make_badarg(env);
    }

    struct futhark_i64_1d *sum = NULL;
    if (futhark_entry_add_i64(*ctx, &sum, *xs, *ys) != 0) {
        return enif_make_badarg(env);
    }

    struct futhark_i64_1d **res = enif_alloc_resource(I64_1D, sizeof(struct futhark_i64_1d *));
    if (res == NULL) {
        /* Do not leak the freshly computed array. */
        futhark_free_i64_1d(*ctx, sum);
        return enif_make_badarg(env);
    }
    *res = sum;

    /* The term keeps the resource alive; drop our own reference. */
    ERL_NIF_TERM ret = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, ret);
}
static ErlNifFunc nif_funcs[] = {
{"futhark_context_config_new", 0, futhark_context_config_new_nif},
{"futhark_context_new", 1, futhark_context_new_nif},
{"futhark_new_i64_1d", 2, futhark_new_i64_1d_nif},
{"futhark_i64_1d_to_binary", 2, futhark_i64_1d_to_binary_nif},
{"futhark_new_u8_1d", 2, futhark_new_u8_1d_nif},
{"futhark_u8_1d_to_binary", 2, futhark_u8_1d_to_binary_nif},
{"futhark_entry_add", 3, futhark_entry_add_nif},
{"futhark_entry_add_i64", 3, futhark_entry_add_i64_nif},
{"futhark_context_sync", 1, futhark_context_sync_nif}
};
ERL_NIF_INIT(Elixir.Map.NIF, nif_funcs, &load, NULL, NULL, NULL)
# Compile and load the NIF wrapper module.
c("lib_map.ex")

# One configuration and one context are shared by both checks below.
{:ok, config} = Map.NIF.futhark_context_config_new()
{:ok, context} = Map.NIF.futhark_context_new(config)

# --- u8: round-trip a binary, then add two arrays element-wise -------------
u8_input = <<0, 1>>
{:ok, u8_left} = Map.NIF.futhark_new_u8_1d(context, u8_input)
{:ok, ^u8_input} = Map.NIF.futhark_u8_1d_to_binary(context, u8_left)
{:ok, u8_right} = Map.NIF.futhark_new_u8_1d(context, <<1, 4>>)
{:ok, u8_sum} = Map.NIF.futhark_entry_add(context, u8_left, u8_right)
{:ok, <<1, 5>>} = Map.NIF.futhark_u8_1d_to_binary(context, u8_sum)

# --- i64: the same checks with one 64-bit little-endian element ------------
i64_input = <<1::integer-signed-64-little>>
{:ok, i64_left} = Map.NIF.futhark_new_i64_1d(context, i64_input)
{:ok, ^i64_input} = Map.NIF.futhark_i64_1d_to_binary(context, i64_left)
{:ok, i64_right} = Map.NIF.futhark_new_i64_1d(context, <<1279::integer-signed-64-little>>)
{:ok, i64_sum} = Map.NIF.futhark_entry_add_i64(context, i64_left, i64_right)
{:ok, <<1280::integer-signed-64-little>>} = Map.NIF.futhark_i64_1d_to_binary(context, i64_sum)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment