Last active
June 21, 2023 11:48
-
-
Save Munksgaard/f2f1d5fa15e7deb8cd1fa6fad9b5edac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs) | |
// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified] | |
// We need to define _GNU_SOURCE before | |
// _any_ header files are included to get | |
// the usage statistics of a thread (i.e. have RUSAGE_THREAD) on GNU/Linux | |
// https://manpages.courier-mta.org/htmlman2/getrusage.2.html | |
#ifndef _GNU_SOURCE // Avoid possible double-definition warning. | |
#define _GNU_SOURCE | |
#endif | |
#ifdef __clang__ | |
#pragma clang diagnostic ignored "-Wunused-function" | |
#pragma clang diagnostic ignored "-Wunused-variable" | |
#pragma clang diagnostic ignored "-Wparentheses" | |
#pragma clang diagnostic ignored "-Wunused-label" | |
#elif __GNUC__ | |
#pragma GCC diagnostic ignored "-Wunused-function" | |
#pragma GCC diagnostic ignored "-Wunused-variable" | |
#pragma GCC diagnostic ignored "-Wparentheses" | |
#pragma GCC diagnostic ignored "-Wunused-label" | |
#pragma GCC diagnostic ignored "-Wunused-but-set-variable" | |
#endif | |
// Headers | |
#include <stdint.h> | |
#include <stddef.h> | |
#include <stdbool.h> | |
#include <stdio.h> | |
#include <float.h> | |
#define CL_TARGET_OPENCL_VERSION 120 | |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS | |
#ifdef __APPLE__ | |
#define CL_SILENCE_DEPRECATION | |
#include <OpenCL/cl.h> | |
#else | |
#include <CL/cl.h> | |
#endif | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
// Initialisation | |
struct futhark_context_config; | |
struct futhark_context_config *futhark_context_config_new(void); | |
void futhark_context_config_free(struct futhark_context_config *cfg); | |
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value); | |
struct futhark_context; | |
struct futhark_context *futhark_context_new(struct futhark_context_config *cfg); | |
void futhark_context_free(struct futhark_context *cfg); | |
void futhark_context_config_add_build_option(struct futhark_context_config *cfg, const char *opt); | |
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s); | |
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s); | |
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg); | |
void futhark_context_config_list_devices(struct futhark_context_config *cfg); | |
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *s); | |
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *s); | |
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *s); | |
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *s); | |
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size); | |
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size); | |
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size); | |
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size); | |
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size); | |
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue); | |
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag); | |
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag); | |
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag); | |
int futhark_get_tuning_param_count(void); | |
const char *futhark_get_tuning_param_name(int); | |
const char *futhark_get_tuning_param_class(int); | |
// Arrays | |
struct futhark_i64_1d; | |
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0); | |
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0); | |
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr); | |
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data); | |
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr); | |
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr); | |
struct futhark_u8_1d; | |
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0); | |
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0); | |
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr); | |
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data); | |
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr); | |
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr); | |
// Opaque values | |
// Entry points | |
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1); | |
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1); | |
// Miscellaneous | |
int futhark_context_sync(struct futhark_context *ctx); | |
cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx); | |
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f); | |
char *futhark_context_report(struct futhark_context *ctx); | |
char *futhark_context_get_error(struct futhark_context *ctx); | |
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f); | |
void futhark_context_pause_profiling(struct futhark_context *ctx); | |
void futhark_context_unpause_profiling(struct futhark_context *ctx); | |
int futhark_context_clear_caches(struct futhark_context *ctx); | |
#define FUTHARK_BACKEND_opencl | |
#define FUTHARK_SUCCESS 0 | |
#define FUTHARK_PROGRAM_ERROR 2 | |
#define FUTHARK_OUT_OF_MEMORY 3 | |
#ifdef __cplusplus | |
} | |
#endif | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <math.h> | |
#include <stdint.h> | |
// If NDEBUG is set, the assert() macro will do nothing. Since Futhark | |
// (unfortunately) makes use of assert() for error detection (and even some | |
// side effects), we want to avoid that. | |
#undef NDEBUG | |
#include <assert.h> | |
#include <stdarg.h> | |
// Start of util.h. | |
// | |
// Various helper functions that are useful in all generated C code. | |
#include <errno.h> | |
#include <string.h> | |
// Program name printed as the prefix of futhark_panic() messages.  The
// pointer itself is not const, so it can be reassigned at runtime.
static const char *fut_progname = "(embedded Futhark)"; | |
static void futhark_panic(int eval, const char *fmt, ...) __attribute__((noreturn)); | |
static char* msgprintf(const char *s, ...); | |
static void* slurp_file(const char *filename, size_t *size); | |
static int dump_file(const char *file, const void *buf, size_t n); | |
struct str_builder; | |
static void str_builder_init(struct str_builder *b); | |
static void str_builder(struct str_builder *b, const char *s, ...); | |
static char *strclone(const char *str); | |
static void futhark_panic(int eval, const char *fmt, ...) { | |
va_list ap; | |
va_start(ap, fmt); | |
fprintf(stderr, "%s: ", fut_progname); | |
vfprintf(stderr, fmt, ap); | |
va_end(ap); | |
exit(eval); | |
} | |
// For generating arbitrary-sized error messages.  Formats the printf-style
// string 's' with its variadic arguments into a freshly malloc()ed buffer
// that is exactly large enough.  Returns the NUL-terminated string, or NULL
// if formatting fails or memory is exhausted.  It is the caller's
// responsibility to free() the buffer at some point.
static char* msgprintf(const char *s, ...) {
  va_list vl;

  // First pass only measures.  A negative result signals a formatting
  // error and must not be converted to size_t.
  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl);
  if (len < 0) {
    return NULL;
  }

  size_t needed = (size_t)len + 1;
  char *buffer = (char*) malloc(needed);
  if (buffer == NULL) {
    return NULL;
  }

  // The first pass consumed the argument list, so it must be re-initialised
  // (and every va_start needs a matching va_end).
  va_start(vl, s);
  vsnprintf(buffer, needed, s, vl);
  va_end(vl);

  return buffer;
}
// If 'errval' is nonzero, print "ERROR: <message> in <fun>() at line <line>
// with error code <code>" to stderr and exit with status 'errval'.  Does
// nothing when 'errval' is zero.
//
//   errval     - error indicator; also used as the process exit status.
//   sets_errno - if nonzero, <code> is strerror(errno); otherwise it is the
//                decimal value of 'errval'.
//   fun, line  - location to report (supplied by the macros below).
//   msg, ...   - printf-style description of what failed.
static inline void check_err(int errval, int sets_errno, const char *fun, int line,
                             const char *msg, ...) {
  if (errval) {
    // Capture errno before any stdio call gets a chance to overwrite it.
    int saved_errno = errno;

    // Large enough for any int, including the sign and the terminator.
    char errnum[16];
    snprintf(errnum, sizeof(errnum), "%d", errval);

    va_list vl;
    va_start(vl, msg);
    fprintf(stderr, "ERROR: ");
    vfprintf(stderr, msg, vl);
    va_end(vl);

    fprintf(stderr, " in %s() at line %d with error code %s\n",
            fun, line,
            sets_errno ? strerror(saved_errno) : errnum);
    exit(errval);
  }
}
// Convenience wrappers that fill in the calling function and line.
#define CHECK_ERR(err, ...) check_err(err, 0, __func__, __LINE__, __VA_ARGS__)
#define CHECK_ERRNO(err, ...) check_err(err, 1, __func__, __LINE__, __VA_ARGS__)
// Read the rest of an open file (from the current position to the end)
// into a freshly malloc()ed, NUL-terminated buffer; returns NULL on error.
// If 'size' is non-NULL it receives the number of bytes that precede the
// terminator (0 if the size could not be determined).  The caller owns the
// returned buffer and must free() it.
static void* fslurp_file(FILE *f, size_t *size) {
  if (size) {
    *size = 0;
  }

  // Measure the remaining bytes by seeking to the end and back.  Streams
  // that cannot report a position (ftell < 0) or cannot seek are rejected
  // instead of producing a bogus size.
  long start = ftell(f);
  if (start < 0 || fseek(f, 0, SEEK_END) != 0) {
    return NULL;
  }
  long end = ftell(f);
  if (end < start || fseek(f, start, SEEK_SET) != 0) {
    return NULL;
  }

  size_t src_size = (size_t)(end - start);
  unsigned char *s = (unsigned char*) malloc(src_size + 1);
  if (s == NULL) {
    return NULL;
  }

  if (fread(s, 1, src_size, f) != src_size) {
    free(s);
    s = NULL;
  } else {
    s[src_size] = '\0';
  }

  if (size) {
    *size = src_size;
  }
  return s;
}
// Open 'filename' in binary mode and hand it to fslurp_file() to read the
// whole file into a NUL-terminated string; returns NULL on error.
static void* slurp_file(const char *filename, size_t *size) {
  unsigned char *contents = NULL;
  FILE *fp = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.

  if (fp != NULL) {
    contents = fslurp_file(fp, size);
    fclose(fp);
  }
  return contents;
}
// Dump 'n' bytes from 'buf' into the file at the designated location,
// creating or truncating it.  Returns 0 on success and 1 if the file could
// not be opened, fully written, or closed.
static int dump_file(const char *file, const void *buf, size_t n) {
  // Binary mode, matching the "rb" used by slurp_file, so that the bytes
  // are written unmodified on platforms that translate line endings.
  FILE *f = fopen(file, "wb");
  if (f == NULL) {
    return 1;
  }

  if (fwrite(buf, sizeof(char), n, f) != n) {
    // Do not leak the stream on a short write.
    fclose(f);
    return 1;
  }

  // fclose flushes buffered data, so its result is part of the outcome.
  if (fclose(f) != 0) {
    return 1;
  }

  return 0;
}
// A growable, NUL-terminated string buffer that is appended to with
// printf-style formatting.
struct str_builder {
  char *str;       // Heap buffer obtained from malloc()/realloc().
  size_t capacity; // Size of buffer.
  size_t used;     // Bytes used, *not* including final zero.
};

// Initialise 'b' as an empty string with a small initial buffer.  If the
// allocation fails, 'b->str' is NULL and 'b->capacity' is 0; a later append
// retries the allocation.
static void str_builder_init(struct str_builder *b) {
  b->capacity = 10;
  b->used = 0;
  b->str = malloc(b->capacity);
  if (b->str == NULL) {
    b->capacity = 0;
    return;
  }
  b->str[0] = 0;
}

// Append the printf-style formatted text described by 's' and its variadic
// arguments to 'b', doubling the buffer until the text fits.  If formatting
// or allocation fails, 'b' is left exactly as it was.
static void str_builder(struct str_builder *b, const char *s, ...) {
  va_list vl;

  // First pass only measures; a negative result is a formatting error.
  va_start(vl, s);
  int len = vsnprintf(NULL, 0, s, vl);
  va_end(vl);
  if (len < 0) {
    return;
  }
  size_t needed = (size_t)len;

  // Work out the new capacity first so the builder is only modified once
  // the reallocation is known to have succeeded.
  size_t capacity = b->capacity > 0 ? b->capacity : 10;
  while (capacity < b->used + needed + 1) {
    capacity *= 2;
  }
  if (capacity != b->capacity) {
    char *grown = realloc(b->str, capacity);
    if (grown == NULL) {
      return;
    }
    b->str = grown;
    b->capacity = capacity;
  }

  va_start(vl, s); // Must re-init.
  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
  va_end(vl);
  b->used += needed;
}
// Return a malloc()ed copy of the NUL-terminated string 'str', or NULL if
// the allocation fails.  The caller owns the copy.
static char *strclone(const char *str) {
  const size_t nbytes = strlen(str) + 1; // Includes the terminator.
  char *dup = (char*) malloc(nbytes);
  return dup != NULL ? (char*) memcpy(dup, str, nbytes) : NULL;
}
// End of util.h. | |
// Start of cache.h | |
#define CACHE_HASH_SIZE 8 // In 32-bit words.

// Accumulated hash identifying the contents a cache file was built from.
struct cache_hash {
  uint32_t hash[CACHE_HASH_SIZE];
};

// Initialise a blank (all-zero) hash.
static void cache_hash_init(struct cache_hash *c);

// Hash some bytes and add them to the accumulated hash.
static void cache_hash(struct cache_hash *out, const char *in, size_t n);

// Try to restore cache contents from a file with the given name.
// The cache is considered invalid if it does not contain the given hash.
// Allocates memory and reads the cache contents, which are returned in
// *buf with size *buflen; the caller must free() *buf.  If the cache is
// successfully loaded, this function returns 0.  Otherwise it returns
// nonzero and leaves *buf and *buflen untouched.  Errno is set if the
// failure to load the cache is due to anything except invalid cache
// contents, in which case it is 0.  Note that failing to restore the cache
// is not necessarily a problem: it might just be invalid or not created yet.
static int cache_restore(const char *fname, const struct cache_hash *hash,
                         unsigned char **buf, size_t *buflen);

// Store cache contents in the given file, with the given hash.  Returns 0
// on success and nonzero if the file could not be opened, written in full,
// or closed.
static int cache_store(const char *fname, const struct cache_hash *hash,
                       const unsigned char *buf, size_t buflen);

// Now for the implementation.

static void cache_hash_init(struct cache_hash *c) {
  memset(c->hash, 0, sizeof(c->hash));
}

static void cache_hash(struct cache_hash *out, const char *in, size_t n) {
  // Adaptation of djb2 for larger output size by storing intermediate
  // states: the running 32-bit state after byte i is XORed into word
  // i % CACHE_HASH_SIZE of the accumulated hash.
  uint32_t hash = 5381;
  for (size_t i = 0; i < n; i++) {
    hash = ((hash << 5) + hash) + in[i];
    out->hash[i % CACHE_HASH_SIZE] ^= hash;
  }
}

// On-disk layout, in host byte order:
//   cache_header (CACHE_HEADER_SIZE bytes)
//   int64_t total file size
//   CACHE_HASH_SIZE 32-bit hash words
//   payload
#define CACHE_HEADER_SIZE 8
static const char cache_header[CACHE_HEADER_SIZE] = "FUTHARK\0";

static int cache_restore(const char *fname, const struct cache_hash *hash,
                         unsigned char **buf, size_t *buflen) {
  // Number of bytes that precede the payload.
  const int64_t preamble =
    (int64_t)(CACHE_HEADER_SIZE + sizeof(int64_t) + sizeof(hash->hash));

  char f_header[CACHE_HEADER_SIZE];
  uint32_t f_hash[CACHE_HASH_SIZE];
  int64_t expected_size;
  unsigned char *payload = NULL;

  FILE *f = fopen(fname, "rb");
  if (f == NULL) {
    return 1;
  }

  if (fread(f_header, sizeof(char), CACHE_HEADER_SIZE, f) != CACHE_HEADER_SIZE) {
    goto error;
  }
  if (memcmp(f_header, cache_header, CACHE_HEADER_SIZE) != 0) {
    errno = 0; // Not a cache file: invalid contents, not an I/O error.
    goto error;
  }

  if (fseek(f, 0, SEEK_END) != 0) {
    goto error;
  }
  long f_end = ftell(f);
  if (f_end < 0) {
    goto error;
  }
  int64_t f_size = (int64_t)f_end;
  if (fseek(f, CACHE_HEADER_SIZE, SEEK_SET) != 0) {
    goto error;
  }

  if (fread(&expected_size, sizeof(int64_t), 1, f) != 1) {
    goto error;
  }
  // A truncated or otherwise inconsistent file is invalid.
  if (f_size != expected_size || f_size < preamble) {
    errno = 0;
    goto error;
  }

  if (fread(f_hash, sizeof(uint32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
    goto error;
  }
  // Compare every byte of the hash, not just the first CACHE_HASH_SIZE bytes.
  if (memcmp(f_hash, hash->hash, sizeof(f_hash)) != 0) {
    errno = 0;
    goto error;
  }

  size_t payload_len = (size_t)(f_size - preamble);
  // Allocate at least one byte so an empty payload is not mistaken for an
  // allocation failure.
  payload = malloc(payload_len > 0 ? payload_len : 1);
  if (payload == NULL) {
    goto error;
  }
  if (fread(payload, sizeof(char), payload_len, f) != payload_len) {
    goto error;
  }

  fclose(f);
  *buf = payload;
  *buflen = payload_len;
  return 0;

 error:
  {
    // Keep the errno value that describes the failure across cleanup.
    int saved_errno = errno;
    free(payload);
    fclose(f);
    errno = saved_errno;
  }
  return 1;
}

static int cache_store(const char *fname, const struct cache_hash *hash,
                       const unsigned char *buf, size_t buflen) {
  FILE *f = fopen(fname, "wb");
  if (f == NULL) {
    return 1;
  }

  // Total file size, stored so that cache_restore can detect truncation.
  int64_t size =
    (int64_t)(CACHE_HEADER_SIZE + sizeof(int64_t) + sizeof(hash->hash) + buflen);

  int failed =
    fwrite(cache_header, CACHE_HEADER_SIZE, 1, f) != 1 ||
    fwrite(&size, sizeof(size), 1, f) != 1 ||
    fwrite(hash->hash, sizeof(uint32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE ||
    fwrite(buf, sizeof(unsigned char), buflen, f) != buflen;

  // fclose flushes buffered data, so a failure here also means the cache
  // was not stored.
  if (fclose(f) != 0) {
    failed = 1;
  }

  return failed ? 1 : 0;
}
// End of cache.h | |
// Start of half.h. | |
// Conversion functions are from http://half.sourceforge.net/, but | |
// translated to C. | |
// | |
// Copyright (c) 2012-2021 Christian Rau | |
// | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
// THE SOFTWARE. | |
#ifndef __OPENCL_VERSION__ | |
#define __constant | |
#endif | |
// Lookup table of 512 16-bit values belonging to the half-precision
// conversion code.  The second 256 entries repeat the first 256 with the
// top bit (0x8000) set.
// NOTE(review): presumably indexed by the sign and exponent bits (top 9
// bits) of a 32-bit float to give the base half-precision bit pattern; the
// consumer is not visible in this part of the file, so confirm there.
__constant static const uint16_t base_table[512] = { | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | |
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, | |
0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, | |
0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, | |
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, | |
0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, | |
0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, | |
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; | |
// Lookup table of 512 byte-sized values between 13 and 24, belonging to
// the half-precision conversion code.  The second 256 entries are a copy
// of the first 256.
// NOTE(review): presumably the right-shift applied to a float's mantissa
// for the same index used with base_table; the consumer is not visible in
// this part of the file, so confirm there.
__constant static const unsigned char shift_table[512] = { | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, | |
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, | |
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, | |
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; | |
__constant static const uint32_t mantissa_table[2048] = { | |
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, | |
0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, | |
0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, | |
0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, | |
0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, | |
0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, | |
0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, | |
0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, | |
0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, | |
0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, | |
0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, | |
0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, | |
0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, | |
0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, | |
0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, | |
0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, | |
0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, | |
0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, | |
0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, | |
0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, | |
0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, | |
0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, | |
0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, | |
0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, | |
0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, | |
0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, | |
0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, | |
0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, | |
0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, | |
0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, | |
0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, | |
0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, | |
0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, | |
0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, | |
0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, | |
0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, | |
0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, | |
0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, | |
0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, | |
0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, | |
0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, | |
0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, | |
0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, | |
0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, | |
0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, | |
0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, | |
0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, | |
0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, | |
0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, | |
0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, | |
0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, | |
0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, | |
0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, | |
0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, | |
0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, | |
0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, | |
0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, | |
0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, | |
0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, | |
0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, | |
0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, | |
0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, | |
0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, | |
0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, | |
0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, | |
0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, | |
0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, | |
0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, | |
0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, | |
0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, | |
0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, | |
0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, | |
0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, | |
0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, | |
0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, | |
0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, | |
0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, | |
0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, | |
0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, | |
0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, | |
0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, | |
0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, | |
0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, | |
0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, | |
0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, | |
0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, | |
0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, | |
0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, | |
0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, | |
0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, | |
0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, | |
0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, | |
0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, | |
0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, | |
0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, | |
0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, | |
0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, | |
0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, | |
0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, | |
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, | |
0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, | |
0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, | |
0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, | |
0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, | |
0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, | |
0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, | |
0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, | |
0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, | |
0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, | |
0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, | |
0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, | |
0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, | |
0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, | |
0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, | |
0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, | |
0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, | |
0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, | |
0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, | |
0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, | |
0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, | |
0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, | |
0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, | |
0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, | |
0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, | |
0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, | |
0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, | |
0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, | |
0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; | |
// Exponent lookup used by halfbits2float(): indexed by the top six bits
// of the 16-bit pattern (value >> 10).  The selected entry is added to a
// mantissa_table entry to form the 32-bit float bit pattern.
__constant static const uint32_t exponent_table[64] = { | |
0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, | |
0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, | |
0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, | |
0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; | |
// Base offsets into mantissa_table used by halfbits2float(): indexed by
// the top six bits of the 16-bit pattern (value >> 10); the low ten bits
// (value & 0x3FF) are added to the selected offset.
__constant static const unsigned short offset_table[64] = { | |
0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, | |
0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; | |
static uint16_t float2halfbits(float value) { | |
union { float x; uint32_t y; } u; | |
u.x = value; | |
uint32_t bits = u.y; | |
uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);; | |
return hbits; | |
} | |
static float halfbits2float(uint16_t value) { | |
uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; | |
union { uint32_t x; float y; } u; | |
u.x = bits; | |
return u.y; | |
} | |
// Return the half-precision bit pattern adjacent to 'from' in the
// direction of 'to'.
//
// - If either argument has a magnitude above 0x7C00 (a NaN pattern), the
//   NaN argument is returned with bit 0x200 set ('from' takes priority).
// - If the arguments are equal, or both are zeros of either sign, 'to'
//   is returned.
// - If 'from' is a zero, the result is the smallest-magnitude non-zero
//   pattern carrying the sign of 'to'.
// - Otherwise the pattern is stepped by one.
static uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {
  const int from_mag = from & 0x7FFF;
  const int to_mag = to & 0x7FFF;
  if (from_mag > 0x7C00) {
    return (uint16_t)(from | 0x200);
  }
  if (to_mag > 0x7C00) {
    return (uint16_t)(to | 0x200);
  }
  if (from == to || (from_mag == 0 && to_mag == 0)) {
    return to;
  }
  if (from_mag == 0) {
    return (uint16_t)((to & 0x8000) + 1);
  }
  // Map each pattern to a key that orders like the value it encodes:
  // patterns with the sign bit clear get the sign bit set, patterns with
  // the sign bit set are bitwise inverted.
  const unsigned int from_neg = from >> 15;
  const unsigned int to_neg = to >> 15;
  const int from_key = from ^ (from_neg ? 0xFFFF : 0x8000);
  const int to_key = to ^ (to_neg ? 0xFFFF : 0x8000);
  const unsigned int ascending = from_key < to_key;
  // The raw pattern grows when moving away from zero: upwards for a
  // clear sign bit, downwards for a set sign bit.
  if (from_neg ^ ascending) {
    return (uint16_t)(from + 1);
  }
  return (uint16_t)(from - 1);
}
// End of half.h. | |
// Start of timing.h. | |
// The function get_wall_time() returns the wall time in microseconds | |
// (with an unspecified offset). | |
#ifdef _WIN32 | |
#include <windows.h> | |
static int64_t get_wall_time(void) { | |
LARGE_INTEGER time,freq; | |
assert(QueryPerformanceFrequency(&freq)); | |
assert(QueryPerformanceCounter(&time)); | |
return ((double)time.QuadPart / freq.QuadPart) * 1000000; | |
} | |
#else | |
// Assuming POSIX | |
#include <time.h> | |
#include <sys/time.h> | |
// Return the wall time in microseconds (with an unspecified offset),
// as reported by gettimeofday().
static int64_t get_wall_time(void) {
  struct timeval time;
  // The call must not live inside assert(): when NDEBUG is defined the
  // assert argument is not evaluated and 'time' would stay uninitialised.
  int r = gettimeofday(&time, NULL);
  assert(r == 0);
  (void)r;
  // Widen before multiplying so the product cannot overflow a narrower
  // time_t.
  return (int64_t)time.tv_sec * 1000000 + (int64_t)time.tv_usec;
}
static int64_t get_wall_time_ns(void) { | |
struct timespec time; | |
assert(clock_gettime(CLOCK_REALTIME, &time) == 0); | |
return time.tv_sec * 1000000000 + time.tv_nsec; | |
} | |
#endif | |
// End of timing.h. | |
// Start of lock.h. | |
// A very simple cross-platform implementation of locks. Uses
// pthreads on POSIX systems and Win32 mutex objects on Windows. Futhark's
// host-level code is not multithreaded, but user code may be, so we | |
// need some mechanism for ensuring atomic access to API functions. | |
// This is that mechanism. It is not exposed to user code at all, so | |
// we do not have to worry about name collisions. | |
#ifdef _WIN32 | |
typedef HANDLE lock_t; | |
static void create_lock(lock_t *lock) { | |
*lock = CreateMutex(NULL, // Default security attributes. | |
FALSE, // Initially unlocked. | |
NULL); // Unnamed. | |
} | |
// Block until the mutex referred to by *lock has been acquired.
static void lock_lock(lock_t *lock) {
  // The wait must not live inside assert(): when NDEBUG is defined the
  // assert argument is not evaluated and the lock would never be taken.
  int acquired = (WaitForSingleObject(*lock, INFINITE) == WAIT_OBJECT_0);
  assert(acquired);
  (void)acquired;
}
// Release the mutex referred to by *lock.
static void lock_unlock(lock_t *lock) {
  // The release must not live inside assert(): when NDEBUG is defined
  // the assert argument is not evaluated and the lock would stay held.
  int released = (ReleaseMutex(*lock) != 0);
  assert(released);
  (void)released;
}
// Close the handle stored in *lock.
static void free_lock(lock_t *lock) {
  lock_t handle = *lock;
  CloseHandle(handle);
}
#else | |
// Assuming POSIX | |
#include <pthread.h> | |
typedef pthread_mutex_t lock_t;
// Initialise *lock as a pthread mutex with default attributes.
// Initialisation failure is checked with assert().
static void create_lock(lock_t *lock) {
  const int status = pthread_mutex_init(lock, NULL);
  assert(status == 0);
  (void)status;
}
// Acquire the mutex; failure is checked with assert().
static void lock_lock(lock_t *lock) {
  const int status = pthread_mutex_lock(lock);
  assert(status == 0);
  (void)status;
}
// Release the mutex; failure is checked with assert().
static void lock_unlock(lock_t *lock) {
  const int status = pthread_mutex_unlock(lock);
  assert(status == 0);
  (void)status;
}
// Counterpart of create_lock().  Deliberately does nothing in the
// pthreads implementation; the parameter is only referenced to silence
// unused-parameter warnings.
// NOTE(review): pthread_mutex_destroy() is never called -- confirm that
// this is intentional.
static void free_lock(lock_t *lock) {
  (void)lock;
}
#endif | |
// End of lock.h. | |
// Start of free_list.h. | |
// Handle for a memory block tracked by the free list, stored as a
// pointer-sized unsigned integer.
typedef uintptr_t fl_mem; | |
// An entry in the free list.  May be invalid, to avoid having to
// deallocate entries as soon as they are removed.  There is also a
// tag, to help with memory reuse.
struct free_list_entry { | |
// Size recorded for the block when it was inserted.
size_t size; | |
// Handle of the block itself.
fl_mem mem; | |
// Tag supplied at insertion; free_list_acceptable() compares it by
// pointer identity, not by string contents.
const char *tag; | |
// Non-zero when this slot currently holds a block.
unsigned char valid; | |
}; | |
// A growable array of free-list slots together with the lock that the
// interface functions take while touching it.
struct free_list { | |
struct free_list_entry *entries; // Heap-allocated array of 'capacity' slots.
int capacity; // Number of slots in 'entries', valid or not.
int used; // Number of slots currently marked valid.
lock_t lock; // Taken by the interface functions for thread safety.
}; | |
static void free_list_init(struct free_list *l) { | |
l->capacity = 30; // Picked arbitrarily. | |
l->used = 0; | |
l->entries = (struct free_list_entry*) malloc(sizeof(struct free_list_entry) * l->capacity); | |
for (int i = 0; i < l->capacity; i++) { | |
l->entries[i].valid = 0; | |
} | |
create_lock(&l->lock); | |
} | |
// Remove invalid entries from the free list. | |
static void free_list_pack(struct free_list *l) { | |
lock_lock(&l->lock); | |
int p = 0; | |
for (int i = 0; i < l->capacity; i++) { | |
if (l->entries[i].valid) { | |
l->entries[p] = l->entries[i]; | |
if (i > p) { | |
l->entries[i].valid = 0; | |
} | |
p++; | |
} | |
} | |
// Now p is the number of used elements. We don't want it to go | |
// less than the default capacity (although in practice it's OK as | |
// long as it doesn't become 1). | |
if (p < 30) { | |
p = 30; | |
} | |
l->entries = realloc(l->entries, p * sizeof(struct free_list_entry)); | |
l->capacity = p; | |
lock_unlock(&l->lock); | |
} | |
static void free_list_destroy(struct free_list *l) { | |
assert(l->used == 0); | |
free(l->entries); | |
free_lock(&l->lock); | |
} | |
// Not part of the interface, so no locking. | |
static int free_list_find_invalid(struct free_list *l) { | |
int i; | |
for (i = 0; i < l->capacity; i++) { | |
if (!l->entries[i].valid) { | |
break; | |
} | |
} | |
return i; | |
} | |
// Record a memory block of the given size and tag in the first free
// slot of the list, doubling the slot array first if it is full.
static void free_list_insert(struct free_list *l, size_t size, fl_mem mem, const char *tag) {
  lock_lock(&l->lock);
  int i = free_list_find_invalid(l);
  if (i == l->capacity) {
    // List is full; so we have to grow it.  The slot count and the byte
    // count are kept apart, and the byte count is computed in size_t so
    // that it is never truncated to int.
    int new_capacity = l->capacity * 2;
    size_t new_bytes = (size_t)new_capacity * sizeof(struct free_list_entry);
    // Keep the old pointer until we know the reallocation succeeded.
    struct free_list_entry *grown =
      (struct free_list_entry*) realloc(l->entries, new_bytes);
    assert(grown != NULL);
    l->entries = grown;
    // The newly added upper half starts out free.
    for (int j = l->capacity; j < new_capacity; j++) {
      l->entries[j].valid = 0;
    }
    l->capacity = new_capacity;
  }
  // Now 'i' points to the first invalid entry.
  l->entries[i].valid = 1;
  l->entries[i].size = size;
  l->entries[i].mem = mem;
  l->entries[i].tag = tag;
  l->used++;
  lock_unlock(&l->lock);
}
// Determine whether this entry in the free list is acceptable for | |
// satisfying the request. Not public, so no locking. | |
static bool free_list_acceptable(size_t size, const char* tag, struct free_list_entry *entry) { | |
// We check not just the hard requirement (is the entry acceptable | |
// and big enough?) but also put a cap on how much wasted space | |
// (internal fragmentation) we allow. This is necessarily a | |
// heuristic, and a crude one. | |
if (!entry->valid) { | |
return false; | |
} | |
if (size > entry->size) { | |
return false; | |
} | |
// We know the block fits. Now the question is whether it is too | |
// big. Our policy is as follows: | |
// | |
// 1) We don't care about wasted space below 4096 bytes (to avoid | |
// churn in tiny allocations). | |
// | |
// 2) If the tag matches, we allow _any_ amount of wasted space. | |
// | |
// 3) Otherwise we allow up to 50% wasted space. | |
if (entry->size < 4096) { | |
return true; | |
} | |
if (entry->tag == tag) { | |
return true; | |
} | |
if (entry->size < size * 2) { | |
return true; | |
} | |
return false; | |
} | |
// Find and remove the smallest memory block that free_list_acceptable()
// accepts for the requested size and tag; among equally small blocks the
// one at the lowest index wins.  On success the block's size and handle
// are written to *size_out and *mem_out and 0 is returned; otherwise the
// outputs are left untouched and 1 is returned.
static int free_list_find(struct free_list *l, size_t size, const char *tag,
                          size_t *size_out, fl_mem *mem_out) {
  lock_lock(&l->lock);
  struct free_list_entry *best = NULL;
  for (int i = 0; i < l->capacity; i++) {
    struct free_list_entry *candidate = &l->entries[i];
    if (!free_list_acceptable(size, tag, candidate)) {
      continue;
    }
    // Keep the candidate only if it is strictly smaller than the best
    // one seen so far.
    if (best == NULL || candidate->size < best->size) {
      best = candidate;
    }
  }
  int ret = 1;
  if (best != NULL) {
    best->valid = 0;
    *size_out = best->size;
    *mem_out = best->mem;
    l->used--;
    ret = 0;
  }
  lock_unlock(&l->lock);
  return ret;
}
// Remove the valid block with the lowest index from the free list and
// write its handle to *mem_out.  Returns 0 if a block was removed, and
// nonzero if the free list was already empty.
static int free_list_first(struct free_list *l, fl_mem *mem_out) {
  lock_lock(&l->lock);
  int idx = 0;
  while (idx < l->capacity && !l->entries[idx].valid) {
    idx++;
  }
  int ret = 1;
  if (idx < l->capacity) {
    struct free_list_entry *entry = &l->entries[idx];
    entry->valid = 0;
    *mem_out = entry->mem;
    l->used--;
    ret = 0;
  }
  lock_unlock(&l->lock);
  return ret;
}
// End of free_list.h. | |
#ifdef _MSC_VER | |
#define inline __inline | |
#endif | |
#include <string.h> | |
#include <string.h> | |
#include <errno.h> | |
#include <assert.h> | |
#include <ctype.h> | |
#define CL_TARGET_OPENCL_VERSION 120 | |
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS | |
#ifdef __APPLE__ | |
#define CL_SILENCE_DEPRECATION | |
#include <OpenCL/cl.h> | |
#else | |
#include <CL/cl.h> | |
#endif | |
#define FUTHARK_F64_ENABLED | |
// Start of scalar.h. | |
// Implementation of the primitive scalar operations. Very | |
// repetitive. This code is inserted directly into both CUDA and | |
// OpenCL programs, as well as the CPU code, so it has some #ifdefs to | |
// work everywhere. Some operations are defined as macros because | |
// this allows us to use them as constant expressions in things like | |
// array sizes and static initialisers. | |
// Some of the #ifdefs are because OpenCL uses type-generic functions | |
// for some operations (e.g. sqrt), while C and CUDA sensibly use | |
// distinct functions for different precisions (e.g. sqrtf() and | |
// sqrt()). This is quite annoying. Due to C's unfortunate casting | |
// rules, it is also really easy to accidentally implement | |
// floating-point functions in the wrong precision, so be careful. | |
// Double-precision definitions are only included if the preprocessor | |
// macro FUTHARK_F64_ENABLED is set. | |
// 8-bit addition; the sum is reduced modulo 2^8.
static inline uint8_t add8(uint8_t x, uint8_t y) {
  const uint8_t sum = (uint8_t)(x + y);
  return sum;
}
// 16-bit addition; the sum is reduced modulo 2^16.
static inline uint16_t add16(uint16_t x, uint16_t y) {
  const uint16_t sum = (uint16_t)(x + y);
  return sum;
}
// 32-bit addition; the sum is reduced modulo 2^32.
static inline uint32_t add32(uint32_t x, uint32_t y) {
  uint32_t sum = x;
  sum += y;
  return sum;
}
// 64-bit addition; the sum is reduced modulo 2^64.
static inline uint64_t add64(uint64_t x, uint64_t y) {
  uint64_t sum = x;
  sum += y;
  return sum;
}
// 8-bit subtraction; the difference is reduced modulo 2^8.
static inline uint8_t sub8(uint8_t x, uint8_t y) {
  const uint8_t difference = (uint8_t)(x - y);
  return difference;
}
// 16-bit subtraction; the difference is reduced modulo 2^16.
static inline uint16_t sub16(uint16_t x, uint16_t y) {
  const uint16_t difference = (uint16_t)(x - y);
  return difference;
}
// 32-bit subtraction; the difference is reduced modulo 2^32.
static inline uint32_t sub32(uint32_t x, uint32_t y) {
  uint32_t difference = x;
  difference -= y;
  return difference;
}
// 64-bit subtraction; the difference is reduced modulo 2^64.
static inline uint64_t sub64(uint64_t x, uint64_t y) {
  uint64_t difference = x;
  difference -= y;
  return difference;
}
// 8-bit multiplication; the product is reduced modulo 2^8.
static inline uint8_t mul8(uint8_t x, uint8_t y) {
  const uint8_t product = (uint8_t)(x * y);
  return product;
}
// 16-bit multiplication; the product is reduced modulo 2^16.
static inline uint16_t mul16(uint16_t x, uint16_t y) {
  // Multiply in uint32_t.  Plain 'x * y' promotes both operands to
  // signed int, and e.g. 0xFFFF * 0xFFFF then overflows a 32-bit int,
  // which is undefined behaviour.
  return (uint16_t)((uint32_t)x * (uint32_t)y);
}
// 32-bit multiplication; the product is reduced modulo 2^32.
static inline uint32_t mul32(uint32_t x, uint32_t y) {
  uint32_t product = x;
  product *= y;
  return product;
}
// 64-bit multiplication; the product is reduced modulo 2^64.
static inline uint64_t mul64(uint64_t x, uint64_t y) {
  uint64_t product = x;
  product *= y;
  return product;
}
#if ISPC | |
// Unsigned 8-bit division (ISPC build): returns x / y.
static inline uint8_t udiv8(uint8_t x, uint8_t y) { | |
// The divisor is copied into ys only inside foreach_active, so any
// inactive lane keeps the initial divisor of 1.  This prevents the ISPC
// compiler from causing SIGFPEs and bogus results on divisions where
// inactive lanes have 0-valued divisors.
// https://github.com/ispc/ispc/issues/2292
uint8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
// Unsigned 16-bit division (ISPC build): returns x / y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never divide by zero (https://github.com/ispc/ispc/issues/2292).
static inline uint16_t udiv16(uint16_t x, uint16_t y) { | |
uint16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
// Unsigned 32-bit division (ISPC build): returns x / y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never divide by zero (https://github.com/ispc/ispc/issues/2292).
static inline uint32_t udiv32(uint32_t x, uint32_t y) { | |
uint32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
// Unsigned 64-bit division (ISPC build): returns x / y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never divide by zero (https://github.com/ispc/ispc/issues/2292).
static inline uint64_t udiv64(uint64_t x, uint64_t y) { | |
uint64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
// Unsigned 8-bit division rounding up (ISPC build): adds y - 1 to x
// before dividing.  Only the divisor goes through the foreach_active
// guard (ys is 1 on inactive lanes); the numerator uses y directly.
static inline uint8_t udiv_up8(uint8_t x, uint8_t y) { | |
uint8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return (x + y - 1) / ys; | |
} | |
// Unsigned 16-bit division rounding up (ISPC build): adds y - 1 to x
// before dividing.  Only the divisor goes through the foreach_active
// guard (ys is 1 on inactive lanes); the numerator uses y directly.
static inline uint16_t udiv_up16(uint16_t x, uint16_t y) { | |
uint16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return (x + y - 1) / ys; | |
} | |
// Unsigned 32-bit division rounding up (ISPC build): adds y - 1 to x
// before dividing.  Only the divisor goes through the foreach_active
// guard (ys is 1 on inactive lanes); the numerator uses y directly.
// NOTE(review): x + y - 1 presumably wraps for very large operands --
// confirm callers stay in range.
static inline uint32_t udiv_up32(uint32_t x, uint32_t y) { | |
uint32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return (x + y - 1) / ys; | |
} | |
// Unsigned 64-bit division rounding up (ISPC build): adds y - 1 to x
// before dividing.  Only the divisor goes through the foreach_active
// guard (ys is 1 on inactive lanes); the numerator uses y directly.
// NOTE(review): x + y - 1 presumably wraps for very large operands --
// confirm callers stay in range.
static inline uint64_t udiv_up64(uint64_t x, uint64_t y) { | |
uint64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return (x + y - 1) / ys; | |
} | |
// Unsigned 8-bit remainder (ISPC build): returns x % y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never take a remainder by zero.
static inline uint8_t umod8(uint8_t x, uint8_t y) { | |
uint8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
// Unsigned 16-bit remainder (ISPC build): returns x % y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never take a remainder by zero.
static inline uint16_t umod16(uint16_t x, uint16_t y) { | |
uint16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
// Unsigned 32-bit remainder (ISPC build): returns x % y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never take a remainder by zero.
static inline uint32_t umod32(uint32_t x, uint32_t y) { | |
uint32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
// Unsigned 64-bit remainder (ISPC build): returns x % y.  The divisor ys
// starts at 1 and is set to y only inside foreach_active, so inactive
// lanes never take a remainder by zero.
static inline uint64_t umod64(uint64_t x, uint64_t y) { | |
uint64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
// Unsigned 8-bit division that yields 0 when y is 0 (ISPC build).  The
// division itself uses ys, which is 1 on inactive lanes and y on active
// ones.
static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) { | |
uint8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
// Unsigned 16-bit division that yields 0 when y is 0 (ISPC build).  The
// division itself uses ys, which is 1 on inactive lanes and y on active
// ones.
static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) { | |
uint16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
// Unsigned 32-bit division that yields 0 when y is 0 (ISPC build).  The
// division itself uses ys, which is 1 on inactive lanes and y on active
// ones.
static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) { | |
uint32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
// Unsigned 64-bit division that yields 0 when y is 0 (ISPC build).  The
// division itself uses ys, which is 1 on inactive lanes and y on active
// ones.
static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) { | |
uint64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
// Unsigned 8-bit rounding-up division that yields 0 when y is 0 (ISPC
// build).  Otherwise y - 1 is added to x before dividing by ys, which is
// 1 on inactive lanes and y on active ones.
static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { | |
uint8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : (x + y - 1) / ys; | |
} | |
// Unsigned 16-bit rounding-up division that yields 0 when y is 0 (ISPC
// build).  Otherwise y - 1 is added to x before dividing by ys, which is
// 1 on inactive lanes and y on active ones.
static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { | |
uint16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : (x + y - 1) / ys; | |
} | |
// Unsigned 32-bit rounding-up division that yields 0 when y is 0 (ISPC
// build).  Otherwise y - 1 is added to x before dividing by ys, which is
// 1 on inactive lanes and y on active ones.
static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { | |
uint32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : (x + y - 1) / ys; | |
} | |
// Unsigned 64-bit rounding-up division that yields 0 when y is 0 (ISPC
// build).  Otherwise y - 1 is added to x before dividing by ys, which is
// 1 on inactive lanes and y on active ones.
static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { | |
uint64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : (x + y - 1) / ys; | |
} | |
// Unsigned 8-bit remainder that yields 0 when y is 0 (ISPC build).  The
// remainder itself uses ys, which is 1 on inactive lanes and y on active
// ones.
static inline uint8_t umod_safe8(uint8_t x, uint8_t y) { | |
uint8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
// Unsigned 16-bit remainder that yields 0 when y is 0 (ISPC build).  The
// remainder itself uses ys, which is 1 on inactive lanes and y on active
// ones.
static inline uint16_t umod_safe16(uint16_t x, uint16_t y) { | |
uint16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
static inline uint32_t umod_safe32(uint32_t x, uint32_t y) { | |
uint32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
static inline uint64_t umod_safe64(uint64_t x, uint64_t y) { | |
uint64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
// Signed division for the foreach_active-based variant. q and r are the
// quotient and remainder of x by ys as produced by / and %; the result is q,
// decremented by one when r is non-zero and the signs of r and y differ
// (i.e. the quotient is adjusted downwards in that case).
// NOTE(review): ys defaults to 1 and is set to y inside foreach_active;
// presumably this shields program instances the loop skips - confirm.
static inline int8_t sdiv8(int8_t x, int8_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int8_t q = x / ys; | |
int8_t r = x % ys; | |
// (r < 0) != (y < 0): relational operators bind tighter than !=.
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); | |
} | |
static inline int16_t sdiv16(int16_t x, int16_t y) { | |
int16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int16_t q = x / ys; | |
int16_t r = x % ys; | |
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); | |
} | |
static inline int32_t sdiv32(int32_t x, int32_t y) { | |
int32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int32_t q = x / ys; | |
int32_t r = x % ys; | |
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); | |
} | |
static inline int64_t sdiv64(int64_t x, int64_t y) { | |
int64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int64_t q = x / ys; | |
int64_t r = x % ys; | |
return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); | |
} | |
// Signed division with the numerator biased by y - 1: each function forwards
// x + y - 1 and y to the sdiv function of the same width.
static inline int8_t sdiv_up8(int8_t x, int8_t y) { | |
return sdiv8(x + y - 1, y); | |
} | |
static inline int16_t sdiv_up16(int16_t x, int16_t y) { | |
return sdiv16(x + y - 1, y); | |
} | |
static inline int32_t sdiv_up32(int32_t x, int32_t y) { | |
return sdiv32(x + y - 1, y); | |
} | |
static inline int64_t sdiv_up64(int64_t x, int64_t y) { | |
return sdiv64(x + y - 1, y); | |
} | |
// Signed modulo for the foreach_active-based variant. r is x % ys; the result
// is r itself when r is 0 or when x and y are both positive or both negative,
// and r + y otherwise.
// NOTE(review): ys defaults to 1 and is set to y inside foreach_active;
// presumably this shields program instances the loop skips - confirm.
static inline int8_t smod8(int8_t x, int8_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int8_t r = x % ys; | |
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); | |
} | |
static inline int16_t smod16(int16_t x, int16_t y) { | |
int16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int16_t r = x % ys; | |
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); | |
} | |
static inline int32_t smod32(int32_t x, int32_t y) { | |
int32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int32_t r = x % ys; | |
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); | |
} | |
static inline int64_t smod64(int64_t x, int64_t y) { | |
int64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
int64_t r = x % ys; | |
return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); | |
} | |
// "Safe" signed division: each function returns 0 when y == 0 and otherwise
// the result of the sdiv function of the same width.
static inline int8_t sdiv_safe8(int8_t x, int8_t y) { | |
return y == 0 ? 0 : sdiv8(x, y); | |
} | |
static inline int16_t sdiv_safe16(int16_t x, int16_t y) { | |
return y == 0 ? 0 : sdiv16(x, y); | |
} | |
static inline int32_t sdiv_safe32(int32_t x, int32_t y) { | |
return y == 0 ? 0 : sdiv32(x, y); | |
} | |
static inline int64_t sdiv_safe64(int64_t x, int64_t y) { | |
return y == 0 ? 0 : sdiv64(x, y); | |
} | |
// "Safe" signed division with the numerator biased by y - 1: each function
// forwards x + y - 1 and y to the sdiv_safe function of the same width.
static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) { | |
return sdiv_safe8(x + y - 1, y); | |
} | |
static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) { | |
return sdiv_safe16(x + y - 1, y); | |
} | |
static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) { | |
return sdiv_safe32(x + y - 1, y); | |
} | |
static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) { | |
return sdiv_safe64(x + y - 1, y); | |
} | |
// "Safe" signed modulo: each function returns 0 when y == 0 and otherwise the
// result of the smod function of the same width.
static inline int8_t smod_safe8(int8_t x, int8_t y) { | |
return y == 0 ? 0 : smod8(x, y); | |
} | |
static inline int16_t smod_safe16(int16_t x, int16_t y) { | |
return y == 0 ? 0 : smod16(x, y); | |
} | |
static inline int32_t smod_safe32(int32_t x, int32_t y) { | |
return y == 0 ? 0 : smod32(x, y); | |
} | |
static inline int64_t smod_safe64(int64_t x, int64_t y) { | |
return y == 0 ? 0 : smod64(x, y); | |
} | |
// Signed quotient for the foreach_active-based variant: each function returns
// x / ys, the plain result of the / operator.
// NOTE(review): ys defaults to 1 and is set to y inside foreach_active;
// presumably this shields program instances the loop skips - confirm.
static inline int8_t squot8(int8_t x, int8_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
static inline int16_t squot16(int16_t x, int16_t y) { | |
int16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
static inline int32_t squot32(int32_t x, int32_t y) { | |
int32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
static inline int64_t squot64(int64_t x, int64_t y) { | |
int64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x / ys; | |
} | |
// Signed remainder for the foreach_active-based variant: each function
// returns x % ys, the plain result of the % operator.
// NOTE(review): ys defaults to 1 and is set to y inside foreach_active;
// presumably this shields program instances the loop skips - confirm.
static inline int8_t srem8(int8_t x, int8_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
static inline int16_t srem16(int16_t x, int16_t y) { | |
int16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
static inline int32_t srem32(int32_t x, int32_t y) { | |
int32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
static inline int64_t srem64(int64_t x, int64_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return x % ys; | |
} | |
// "Safe" signed quotient for the foreach_active-based variant: each function
// returns 0 when y == 0 and x / ys otherwise.
// NOTE(review): ys defaults to 1 and is set to y inside foreach_active;
// presumably this shields program instances the loop skips - confirm.
static inline int8_t squot_safe8(int8_t x, int8_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
static inline int16_t squot_safe16(int16_t x, int16_t y) { | |
int16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
static inline int32_t squot_safe32(int32_t x, int32_t y) { | |
int32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
static inline int64_t squot_safe64(int64_t x, int64_t y) { | |
int64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x / ys; | |
} | |
// "Safe" signed remainder for the foreach_active-based variant: each function
// returns 0 when y == 0 and x % ys otherwise.
// NOTE(review): ys defaults to 1 and is set to y inside foreach_active;
// presumably this shields program instances the loop skips - confirm.
static inline int8_t srem_safe8(int8_t x, int8_t y) { | |
int8_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
static inline int16_t srem_safe16(int16_t x, int16_t y) { | |
int16_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
static inline int32_t srem_safe32(int32_t x, int32_t y) { | |
int32_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
static inline int64_t srem_safe64(int64_t x, int64_t y) { | |
int64_t ys = 1; | |
foreach_active(i){ | |
ys = y; | |
} | |
return y == 0 ? 0 : x % ys; | |
} | |
#else | |
// Unsigned division, one function per width. No zero check is made on y.
static inline uint8_t udiv8(uint8_t x, uint8_t y) {
  const uint8_t quotient = x / y;
  return quotient;
}
static inline uint16_t udiv16(uint16_t x, uint16_t y) {
  const uint16_t quotient = x / y;
  return quotient;
}
static inline uint32_t udiv32(uint32_t x, uint32_t y) {
  const uint32_t quotient = x / y;
  return quotient;
}
static inline uint64_t udiv64(uint64_t x, uint64_t y) {
  const uint64_t quotient = x / y;
  return quotient;
}
// Unsigned division with the numerator biased by y - 1, which rounds the
// quotient up as long as x + y - 1 does not wrap. No zero check is made on y.
static inline uint8_t udiv_up8(uint8_t x, uint8_t y) {
  const uint8_t rounded_up = (x + y - 1) / y;
  return rounded_up;
}
static inline uint16_t udiv_up16(uint16_t x, uint16_t y) {
  const uint16_t rounded_up = (x + y - 1) / y;
  return rounded_up;
}
static inline uint32_t udiv_up32(uint32_t x, uint32_t y) {
  const uint32_t rounded_up = (x + y - 1) / y;
  return rounded_up;
}
static inline uint64_t udiv_up64(uint64_t x, uint64_t y) {
  const uint64_t rounded_up = (x + y - 1) / y;
  return rounded_up;
}
// Unsigned remainder, one function per width. No zero check is made on y.
static inline uint8_t umod8(uint8_t x, uint8_t y) {
  const uint8_t remainder = x % y;
  return remainder;
}
static inline uint16_t umod16(uint16_t x, uint16_t y) {
  const uint16_t remainder = x % y;
  return remainder;
}
static inline uint32_t umod32(uint32_t x, uint32_t y) {
  const uint32_t remainder = x % y;
  return remainder;
}
static inline uint64_t umod64(uint64_t x, uint64_t y) {
  const uint64_t remainder = x % y;
  return remainder;
}
// Unsigned division that yields 0 instead of dividing when y is 0.
static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
// Unsigned division with the numerator biased by y - 1 (rounds up as long as
// x + y - 1 does not wrap); yields 0 instead of dividing when y is 0.
static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {
  if (y == 0) {
    return 0;
  }
  return (x + y - 1) / y;
}
// Unsigned remainder that yields 0 instead of dividing when y is 0.
static inline uint8_t umod_safe8(uint8_t x, uint8_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline uint16_t umod_safe16(uint16_t x, uint16_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline uint32_t umod_safe32(uint32_t x, uint32_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline uint64_t umod_safe64(uint64_t x, uint64_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
// Signed division rounding toward negative infinity. The / and % operators
// truncate toward zero, so the quotient is lowered by one whenever the
// remainder is non-zero and its sign differs from the divisor's.
// No zero check is made on y.
static inline int8_t sdiv8(int8_t x, int8_t y) {
  const int8_t quot = x / y;
  const int8_t rem = x % y;
  if (rem != 0 && ((rem < 0) != (y < 0))) {
    return quot - 1;
  }
  return quot;
}
static inline int16_t sdiv16(int16_t x, int16_t y) {
  const int16_t quot = x / y;
  const int16_t rem = x % y;
  if (rem != 0 && ((rem < 0) != (y < 0))) {
    return quot - 1;
  }
  return quot;
}
static inline int32_t sdiv32(int32_t x, int32_t y) {
  const int32_t quot = x / y;
  const int32_t rem = x % y;
  if (rem != 0 && ((rem < 0) != (y < 0))) {
    return quot - 1;
  }
  return quot;
}
static inline int64_t sdiv64(int64_t x, int64_t y) {
  const int64_t quot = x / y;
  const int64_t rem = x % y;
  if (rem != 0 && ((rem < 0) != (y < 0))) {
    return quot - 1;
  }
  return quot;
}
// Signed division with the numerator biased by y - 1; the biased numerator is
// handed to the sdiv function of the same width.
static inline int8_t sdiv_up8(int8_t x, int8_t y) {
  const int8_t biased = x + y - 1;
  return sdiv8(biased, y);
}
static inline int16_t sdiv_up16(int16_t x, int16_t y) {
  const int16_t biased = x + y - 1;
  return sdiv16(biased, y);
}
static inline int32_t sdiv_up32(int32_t x, int32_t y) {
  const int32_t biased = x + y - 1;
  return sdiv32(biased, y);
}
static inline int64_t sdiv_up64(int64_t x, int64_t y) {
  const int64_t biased = x + y - 1;
  return sdiv64(biased, y);
}
// Signed modulo whose non-zero result takes the sign of the divisor.
// The % remainder is returned unchanged when it is 0 or when x and y are both
// positive or both negative; otherwise y is added to it.
// No zero check is made on y.
static inline int8_t smod8(int8_t x, int8_t y) {
  const int8_t rem = x % y;
  if (rem == 0) {
    return rem;
  }
  if ((x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
static inline int16_t smod16(int16_t x, int16_t y) {
  const int16_t rem = x % y;
  if (rem == 0) {
    return rem;
  }
  if ((x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
static inline int32_t smod32(int32_t x, int32_t y) {
  const int32_t rem = x % y;
  if (rem == 0) {
    return rem;
  }
  if ((x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
static inline int64_t smod64(int64_t x, int64_t y) {
  const int64_t rem = x % y;
  if (rem == 0) {
    return rem;
  }
  if ((x > 0 && y > 0) || (x < 0 && y < 0)) {
    return rem;
  }
  return rem + y;
}
// Signed division via the sdiv function of the same width, except that a zero
// divisor yields 0 without dividing.
static inline int8_t sdiv_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv8(x, y);
}
static inline int16_t sdiv_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv16(x, y);
}
static inline int32_t sdiv_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv32(x, y);
}
static inline int64_t sdiv_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return sdiv64(x, y);
}
// Signed division with the numerator biased by y - 1; the biased numerator is
// handed to the sdiv_safe function of the same width.
static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {
  const int8_t biased = x + y - 1;
  return sdiv_safe8(biased, y);
}
static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {
  const int16_t biased = x + y - 1;
  return sdiv_safe16(biased, y);
}
static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {
  const int32_t biased = x + y - 1;
  return sdiv_safe32(biased, y);
}
static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {
  const int64_t biased = x + y - 1;
  return sdiv_safe64(biased, y);
}
// Signed modulo via the smod function of the same width, except that a zero
// divisor yields 0 without dividing.
static inline int8_t smod_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return smod8(x, y);
}
static inline int16_t smod_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return smod16(x, y);
}
static inline int32_t smod_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return smod32(x, y);
}
static inline int64_t smod_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return smod64(x, y);
}
// Signed quotient as produced by the / operator (truncates toward zero).
// No zero check is made on y.
static inline int8_t squot8(int8_t x, int8_t y) {
  const int8_t quot = x / y;
  return quot;
}
static inline int16_t squot16(int16_t x, int16_t y) {
  const int16_t quot = x / y;
  return quot;
}
static inline int32_t squot32(int32_t x, int32_t y) {
  const int32_t quot = x / y;
  return quot;
}
static inline int64_t squot64(int64_t x, int64_t y) {
  const int64_t quot = x / y;
  return quot;
}
// Signed remainder as produced by the % operator (sign follows the dividend).
// No zero check is made on y.
static inline int8_t srem8(int8_t x, int8_t y) {
  const int8_t rem = x % y;
  return rem;
}
static inline int16_t srem16(int16_t x, int16_t y) {
  const int16_t rem = x % y;
  return rem;
}
static inline int32_t srem32(int32_t x, int32_t y) {
  const int32_t rem = x % y;
  return rem;
}
static inline int64_t srem64(int64_t x, int64_t y) {
  const int64_t rem = x % y;
  return rem;
}
// Signed truncating quotient that yields 0 instead of dividing when y is 0.
static inline int8_t squot_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline int16_t squot_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline int32_t squot_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
static inline int64_t squot_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return x / y;
}
// Signed % remainder that yields 0 instead of dividing when y is 0.
static inline int8_t srem_safe8(int8_t x, int8_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline int16_t srem_safe16(int16_t x, int16_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline int32_t srem_safe32(int32_t x, int32_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
static inline int64_t srem_safe64(int64_t x, int64_t y) {
  if (y == 0) {
    return 0;
  }
  return x % y;
}
#endif | |
// Signed minimum of two values, one function per width.
static inline int8_t smin8(int8_t x, int8_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
static inline int16_t smin16(int16_t x, int16_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
static inline int32_t smin32(int32_t x, int32_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
static inline int64_t smin64(int64_t x, int64_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
// Unsigned minimum of two values, one function per width.
static inline uint8_t umin8(uint8_t x, uint8_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
static inline uint16_t umin16(uint16_t x, uint16_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
static inline uint32_t umin32(uint32_t x, uint32_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
static inline uint64_t umin64(uint64_t x, uint64_t y) {
  if (x < y) {
    return x;
  }
  return y;
}
// Signed maximum of two values, one function per width.
static inline int8_t smax8(int8_t x, int8_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
static inline int16_t smax16(int16_t x, int16_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
static inline int32_t smax32(int32_t x, int32_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
static inline int64_t smax64(int64_t x, int64_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
// Unsigned maximum of two values, one function per width.
static inline uint8_t umax8(uint8_t x, uint8_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
static inline uint16_t umax16(uint16_t x, uint16_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
static inline uint32_t umax32(uint32_t x, uint32_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
static inline uint64_t umax64(uint64_t x, uint64_t y) {
  if (x < y) {
    return y;
  }
  return x;
}
// Left shift, one function per width. The 8- and 16-bit variants cast the
// shifted value back to the narrow type, discarding the bits shifted out.
static inline uint8_t shl8(uint8_t x, uint8_t y) {
  const uint8_t shifted = (uint8_t)(x << y);
  return shifted;
}
static inline uint16_t shl16(uint16_t x, uint16_t y) {
  const uint16_t shifted = (uint16_t)(x << y);
  return shifted;
}
static inline uint32_t shl32(uint32_t x, uint32_t y) {
  const uint32_t shifted = x << y;
  return shifted;
}
static inline uint64_t shl64(uint64_t x, uint64_t y) {
  const uint64_t shifted = x << y;
  return shifted;
}
// Right shift on unsigned operands (zeros are shifted in), one per width.
static inline uint8_t lshr8(uint8_t x, uint8_t y) {
  const uint8_t shifted = x >> y;
  return shifted;
}
static inline uint16_t lshr16(uint16_t x, uint16_t y) {
  const uint16_t shifted = x >> y;
  return shifted;
}
static inline uint32_t lshr32(uint32_t x, uint32_t y) {
  const uint32_t shifted = x >> y;
  return shifted;
}
static inline uint64_t lshr64(uint64_t x, uint64_t y) {
  const uint64_t shifted = x >> y;
  return shifted;
}
// Right shift on signed operands using the >> operator, one per width.
static inline int8_t ashr8(int8_t x, int8_t y) {
  const int8_t shifted = x >> y;
  return shifted;
}
static inline int16_t ashr16(int16_t x, int16_t y) {
  const int16_t shifted = x >> y;
  return shifted;
}
static inline int32_t ashr32(int32_t x, int32_t y) {
  const int32_t shifted = x >> y;
  return shifted;
}
static inline int64_t ashr64(int64_t x, int64_t y) {
  const int64_t shifted = x >> y;
  return shifted;
}
// Bitwise AND, one function per width.
static inline uint8_t and8(uint8_t x, uint8_t y) {
  const uint8_t bits = x & y;
  return bits;
}
static inline uint16_t and16(uint16_t x, uint16_t y) {
  const uint16_t bits = x & y;
  return bits;
}
static inline uint32_t and32(uint32_t x, uint32_t y) {
  const uint32_t bits = x & y;
  return bits;
}
static inline uint64_t and64(uint64_t x, uint64_t y) {
  const uint64_t bits = x & y;
  return bits;
}
// Bitwise OR, one function per width.
static inline uint8_t or8(uint8_t x, uint8_t y) {
  const uint8_t bits = x | y;
  return bits;
}
static inline uint16_t or16(uint16_t x, uint16_t y) {
  const uint16_t bits = x | y;
  return bits;
}
static inline uint32_t or32(uint32_t x, uint32_t y) {
  const uint32_t bits = x | y;
  return bits;
}
static inline uint64_t or64(uint64_t x, uint64_t y) {
  const uint64_t bits = x | y;
  return bits;
}
// Bitwise exclusive OR, one function per width.
static inline uint8_t xor8(uint8_t x, uint8_t y) {
  const uint8_t bits = x ^ y;
  return bits;
}
static inline uint16_t xor16(uint16_t x, uint16_t y) {
  const uint16_t bits = x ^ y;
  return bits;
}
static inline uint32_t xor32(uint32_t x, uint32_t y) {
  const uint32_t bits = x ^ y;
  return bits;
}
static inline uint64_t xor64(uint64_t x, uint64_t y) {
  const uint64_t bits = x ^ y;
  return bits;
}
// Unsigned "less than" comparison, one function per width.
static inline bool ult8(uint8_t x, uint8_t y) {
  return y > x;
}
static inline bool ult16(uint16_t x, uint16_t y) {
  return y > x;
}
static inline bool ult32(uint32_t x, uint32_t y) {
  return y > x;
}
static inline bool ult64(uint64_t x, uint64_t y) {
  return y > x;
}
// Unsigned "less than or equal" comparison, one function per width.
static inline bool ule8(uint8_t x, uint8_t y) {
  return y >= x;
}
static inline bool ule16(uint16_t x, uint16_t y) {
  return y >= x;
}
static inline bool ule32(uint32_t x, uint32_t y) {
  return y >= x;
}
static inline bool ule64(uint64_t x, uint64_t y) {
  return y >= x;
}
// Signed "less than" comparison, one function per width.
static inline bool slt8(int8_t x, int8_t y) {
  return y > x;
}
static inline bool slt16(int16_t x, int16_t y) {
  return y > x;
}
static inline bool slt32(int32_t x, int32_t y) {
  return y > x;
}
static inline bool slt64(int64_t x, int64_t y) {
  return y > x;
}
// Signed "less than or equal" comparison, one function per width.
static inline bool sle8(int8_t x, int8_t y) {
  return y >= x;
}
static inline bool sle16(int16_t x, int16_t y) {
  return y >= x;
}
static inline bool sle32(int32_t x, int32_t y) {
  return y >= x;
}
static inline bool sle64(int64_t x, int64_t y) {
  return y >= x;
}
// Integer power x^y by repeated squaring, one function per width.
// The exponent is consumed bit by bit from the least significant end: the
// accumulator is multiplied by the current square of x for every set bit.
// Results are stored back into the operand width after each step.
static inline uint8_t pow8(uint8_t x, uint8_t y) {
  uint8_t acc = 1;
  for (uint8_t exp = y; exp != 0; exp >>= 1) {
    if (exp & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
static inline uint16_t pow16(uint16_t x, uint16_t y) {
  uint16_t acc = 1;
  for (uint16_t exp = y; exp != 0; exp >>= 1) {
    if (exp & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
static inline uint32_t pow32(uint32_t x, uint32_t y) {
  uint32_t acc = 1;
  for (uint32_t exp = y; exp != 0; exp >>= 1) {
    if (exp & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
static inline uint64_t pow64(uint64_t x, uint64_t y) {
  uint64_t acc = 1;
  for (uint64_t exp = y; exp != 0; exp >>= 1) {
    if (exp & 1) {
      acc *= x;
    }
    x *= x;
  }
  return acc;
}
// Integer to boolean: false for 0, true for every other value.
static inline bool itob_i8_bool(int8_t x) {
  return !(x == 0);
}
static inline bool itob_i16_bool(int16_t x) {
  return !(x == 0);
}
static inline bool itob_i32_bool(int32_t x) {
  return !(x == 0);
}
static inline bool itob_i64_bool(int64_t x) {
  return !(x == 0);
}
// Boolean to integer: 1 for true, 0 for false.
static inline int8_t btoi_bool_i8(bool x) {
  return x ? 1 : 0;
}
static inline int16_t btoi_bool_i16(bool x) {
  return x ? 1 : 0;
}
static inline int32_t btoi_bool_i32(bool x) {
  return x ? 1 : 0;
}
static inline int64_t btoi_bool_i64(bool x) {
  return x ? 1 : 0;
}
// sext_iA_iB(x): conversion between signed widths. x is first cast to the
// signed A-bit type and the result is then cast to the signed B-bit type, so
// a widening conversion preserves the sign of the A-bit value.
#define sext_i8_i8(x) ((int8_t) (int8_t) (x)) | |
#define sext_i8_i16(x) ((int16_t) (int8_t) (x)) | |
#define sext_i8_i32(x) ((int32_t) (int8_t) (x)) | |
#define sext_i8_i64(x) ((int64_t) (int8_t) (x)) | |
#define sext_i16_i8(x) ((int8_t) (int16_t) (x)) | |
#define sext_i16_i16(x) ((int16_t) (int16_t) (x)) | |
#define sext_i16_i32(x) ((int32_t) (int16_t) (x)) | |
#define sext_i16_i64(x) ((int64_t) (int16_t) (x)) | |
#define sext_i32_i8(x) ((int8_t) (int32_t) (x)) | |
#define sext_i32_i16(x) ((int16_t) (int32_t) (x)) | |
#define sext_i32_i32(x) ((int32_t) (int32_t) (x)) | |
#define sext_i32_i64(x) ((int64_t) (int32_t) (x)) | |
#define sext_i64_i8(x) ((int8_t) (int64_t) (x)) | |
#define sext_i64_i16(x) ((int16_t) (int64_t) (x)) | |
#define sext_i64_i32(x) ((int32_t) (int64_t) (x)) | |
#define sext_i64_i64(x) ((int64_t) (int64_t) (x)) | |
// zext_iA_iB(x): zero-extending conversion. x is first cast to the unsigned
// A-bit type and the result is then cast to the signed B-bit type, so a
// widening conversion fills the new high bits with zeros.
#define zext_i8_i8(x) ((int8_t) (uint8_t) (x)) | |
#define zext_i8_i16(x) ((int16_t) (uint8_t) (x)) | |
#define zext_i8_i32(x) ((int32_t) (uint8_t) (x)) | |
#define zext_i8_i64(x) ((int64_t) (uint8_t) (x)) | |
#define zext_i16_i8(x) ((int8_t) (uint16_t) (x)) | |
#define zext_i16_i16(x) ((int16_t) (uint16_t) (x)) | |
#define zext_i16_i32(x) ((int32_t) (uint16_t) (x)) | |
#define zext_i16_i64(x) ((int64_t) (uint16_t) (x)) | |
#define zext_i32_i8(x) ((int8_t) (uint32_t) (x)) | |
#define zext_i32_i16(x) ((int16_t) (uint32_t) (x)) | |
#define zext_i32_i32(x) ((int32_t) (uint32_t) (x)) | |
#define zext_i32_i64(x) ((int64_t) (uint32_t) (x)) | |
#define zext_i64_i8(x) ((int8_t) (uint64_t) (x)) | |
#define zext_i64_i16(x) ((int16_t) (uint64_t) (x)) | |
#define zext_i64_i32(x) ((int32_t) (uint64_t) (x)) | |
#define zext_i64_i64(x) ((int64_t) (uint64_t) (x)) | |
// Absolute value, one function per width. The 8- and 16-bit variants cast the
// result of abs() back to the narrow type.
static int8_t abs8(int8_t x) { | |
return (int8_t)abs(x); | |
} | |
static int16_t abs16(int16_t x) { | |
return (int16_t)abs(x); | |
} | |
static int32_t abs32(int32_t x) { | |
return abs(x); | |
} | |
// 64-bit variant: calls abs() when compiled as OpenCL or ISPC and llabs()
// otherwise.
static int64_t abs64(int64_t x) { | |
#if defined(__OPENCL_VERSION__) || defined(ISPC) | |
return abs(x); | |
#else | |
return llabs(x); | |
#endif | |
} | |
#if defined(__OPENCL_VERSION__) | |
// OpenCL branch of the population-count helpers: every width forwards its
// argument unchanged to popcount().
static int32_t futrts_popc8(int8_t x) { | |
return popcount(x); | |
} | |
static int32_t futrts_popc16(int16_t x) { | |
return popcount(x); | |
} | |
static int32_t futrts_popc32(int32_t x) { | |
return popcount(x); | |
} | |
static int32_t futrts_popc64(int64_t x) { | |
return popcount(x); | |
} | |
#elif defined(__CUDA_ARCH__) | |
// CUDA branch of the population-count helpers. The 8- and 16-bit variants
// zero-extend to 32 bits (zext_i8_i32 / zext_i16_i32) before calling __popc,
// so bits above the original width are zero and do not add to the count.
static int32_t futrts_popc8(int8_t x) { | |
return __popc(zext_i8_i32(x)); | |
} | |
static int32_t futrts_popc16(int16_t x) { | |
return __popc(zext_i16_i32(x)); | |
} | |
static int32_t futrts_popc32(int32_t x) { | |
return __popc(x); | |
} | |
// 64-bit variant uses __popcll.
static int32_t futrts_popc64(int64_t x) { | |
return __popcll(x); | |
} | |
#else // Not OpenCL or CUDA, but plain C. | |
// Plain-C population count: x &= x - 1 clears the lowest set bit, so the
// number of iterations until x reaches 0 equals the number of set bits.
static int32_t futrts_popc8(uint8_t x) {
  int bits = 0;
  while (x) {
    x &= x - 1;
    ++bits;
  }
  return bits;
}
static int32_t futrts_popc16(uint16_t x) {
  int bits = 0;
  while (x) {
    x &= x - 1;
    ++bits;
  }
  return bits;
}
static int32_t futrts_popc32(uint32_t x) {
  int bits = 0;
  while (x) {
    x &= x - 1;
    ++bits;
  }
  return bits;
}
static int32_t futrts_popc64(uint64_t x) {
  int bits = 0;
  while (x) {
    x &= x - 1;
    ++bits;
  }
  return bits;
}
#endif | |
#if defined(__OPENCL_VERSION__) | |
// OpenCL branch of the "high half of the product" helpers: every variant
// forwards a and b to mul_hi().
// NOTE(review): the signed variants here declare unsigned return types,
// whereas the ISPC and plain-C branches of this file declare signed ones.
static uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); } | |
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); } | |
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); } | |
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); } | |
static uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return mul_hi(a, b); } | |
static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); } | |
static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); } | |
static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); } | |
#elif defined(__CUDA_ARCH__) | |
// CUDA branch of the "high half of the product" helpers. The 8- and 16-bit
// variants widen both operands, multiply, and shift right by the operand
// width; the 32- and 64-bit variants call __umulhi / __umul64hi (unsigned)
// and __mulhi / __mul64hi (signed).
// NOTE(review): the signed variants here declare unsigned return types,
// whereas the ISPC and plain-C branches of this file declare signed ones.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } | |
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } | |
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); } | |
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); } | |
static uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } | |
static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } | |
static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); } | |
static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); } | |
#elif ISPC | |
// ISPC branch, unsigned high half of the product for 8/16/32-bit operands:
// both operands are cast to the unsigned type of twice the width, multiplied,
// and the product is shifted right by the operand width.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } | |
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } | |
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } | |
// ISPC branch: upper 64 bits of the 128-bit product a * b, computed without a
// 128-bit type. Each operand is split into 32-bit halves (ah:al, bh:bl) and
// the four partial products are combined:
//   a * b = p4 * 2^64 + (p2 + p3) * 2^32 + p1
// The previously unused locals p2h/p3h now feed the middle sum instead of the
// same shifts being recomputed.
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  uint64_t ah = a >> 32;
  uint64_t al = a & 0xffffffff;
  uint64_t bh = b >> 32;
  uint64_t bl = b & 0xffffffff;
  uint64_t p1 = al * bl;
  uint64_t p2 = al * bh;
  uint64_t p3 = ah * bl;
  uint64_t p4 = ah * bh;
  uint64_t p1h = p1 >> 32;
  uint64_t p2h = p2 >> 32;
  uint64_t p3h = p3 >> 32;
  uint64_t p2l = p2 & 0xffffffff;
  uint64_t p3l = p3 & 0xffffffff;
  // l gathers everything that lands in bits 32..63; its own upper half is the
  // carry into bit 64.
  uint64_t l = p1h + p2l + p3l;
  uint64_t m = p2h + p3h;
  uint64_t h = (l >> 32) + m + p4;
  return h;
}
// ISPC branch, signed high half of the product for 8/16/32-bit operands.
// Both operands are cast to the *unsigned* type of twice the width,
// multiplied, and the product is shifted right by the operand width; the
// result is returned through the signed return type.
// NOTE(review): this relies on the multiplication being carried out in the
// double-width unsigned type - confirm against ISPC's conversion rules.
static int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } | |
static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } | |
static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } | |
// ISPC branch: upper 64 bits of the signed 128-bit product a * b, computed
// without a 128-bit type. The low halves al/bl are unsigned 32-bit values; the
// high halves ah/bh come from a signed right shift, so they carry the sign.
// The cross products p2/p3 are kept signed so that shifting them right by 32
// preserves their sign.
// The previously unused locals p2h/p3h now feed the middle sum instead of the
// same shifts being recomputed; the sum is identical modulo 2^64.
static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
  uint64_t ah = a >> 32;
  uint64_t al = a & 0xffffffff;
  uint64_t bh = b >> 32;
  uint64_t bl = b & 0xffffffff;
  uint64_t p1 = al * bl;
  int64_t p2 = al * bh;
  int64_t p3 = ah * bl;
  uint64_t p4 = ah * bh;
  uint64_t p1h = p1 >> 32;
  uint64_t p2h = p2 >> 32;
  uint64_t p3h = p3 >> 32;
  uint64_t p2l = p2 & 0xffffffff;
  uint64_t p3l = p3 & 0xffffffff;
  // l gathers everything that lands in bits 32..63; its own upper half is the
  // carry into bit 64.
  uint64_t l = p1h + p2l + p3l;
  uint64_t m = p2h + p3h;
  uint64_t h = (l >> 32) + m + p4;
  return h;
}
#else // Not OpenCL, ISPC, or CUDA, but plain C. | |
// Plain-C "high half of the product" helpers: both operands are widened to
// twice their width, multiplied, and the product is shifted right by the
// operand width. The 64-bit variants use the compiler's 128-bit integer types.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) {
  return ((uint16_t)a) * ((uint16_t)b) >> 8;
}
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) {
  const uint32_t product = ((uint32_t)a) * ((uint32_t)b);
  return product >> 16;
}
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) {
  const uint64_t product = ((uint64_t)a) * ((uint64_t)b);
  return product >> 32;
}
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  const __uint128_t product = ((__uint128_t)a) * ((__uint128_t)b);
  return product >> 64;
}
static int8_t futrts_smul_hi8(int8_t a, int8_t b) {
  return ((int16_t)a) * ((int16_t)b) >> 8;
}
static int16_t futrts_smul_hi16(int16_t a, int16_t b) {
  const int32_t product = ((int32_t)a) * ((int32_t)b);
  return product >> 16;
}
static int32_t futrts_smul_hi32(int32_t a, int32_t b) {
  const int64_t product = ((int64_t)a) * ((int64_t)b);
  return product >> 32;
}
static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
  const __int128_t product = ((__int128_t)a) * ((__int128_t)b);
  return product >> 64;
}
#endif | |
#if defined(__OPENCL_VERSION__) | |
// OpenCL branch of the multiply-high-and-add helpers: every variant forwards
// a, b and c to mad_hi(). The signed variants declare unsigned return types.
static uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); } | |
static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); } | |
static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); } | |
static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); } | |
static uint8_t futrts_smad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); } | |
static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); } | |
static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); } | |
static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); } | |
#else // Not OpenCL | |
// Non-OpenCL multiply-high-and-add helpers: c is added to the result of the
// matching futrts_umul_hi / futrts_smul_hi function. The signed variants
// return the sum through an unsigned return type.
static uint8_t futrts_umad_hi8(uint8_t a, uint8_t b, uint8_t c) {
  return c + futrts_umul_hi8(a, b);
}
static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) {
  return c + futrts_umul_hi16(a, b);
}
static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) {
  return c + futrts_umul_hi32(a, b);
}
static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) {
  return c + futrts_umul_hi64(a, b);
}
static uint8_t futrts_smad_hi8(int8_t a, int8_t b, int8_t c) {
  return c + futrts_smul_hi8(a, b);
}
static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) {
  return c + futrts_smul_hi16(a, b);
}
static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) {
  return c + futrts_smul_hi32(a, b);
}
static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) {
  return c + futrts_smul_hi64(a, b);
}
#endif | |
#if defined(__OPENCL_VERSION__) | |
// OpenCL branch of the count-leading-zeros helpers: every width forwards its
// argument unchanged to clz().
static int32_t futrts_clzz8(int8_t x) { | |
return clz(x); | |
} | |
static int32_t futrts_clzz16(int16_t x) { | |
return clz(x); | |
} | |
static int32_t futrts_clzz32(int32_t x) { | |
return clz(x); | |
} | |
static int32_t futrts_clzz64(int64_t x) { | |
return clz(x); | |
} | |
#elif defined(__CUDA_ARCH__) | |
// CUDA branch of the count-leading-zeros helpers. The 8- and 16-bit variants
// zero-extend to 32 bits, call __clz, and subtract the 24 or 16 leading zeros
// that the extension itself contributes.
static int32_t futrts_clzz8(int8_t x) { | |
return __clz(zext_i8_i32(x)) - 24; | |
} | |
static int32_t futrts_clzz16(int16_t x) { | |
return __clz(zext_i16_i32(x)) - 16; | |
} | |
static int32_t futrts_clzz32(int32_t x) { | |
return __clz(x); | |
} | |
// 64-bit variant uses __clzll.
static int32_t futrts_clzz64(int64_t x) { | |
return __clzll(x); | |
} | |
#elif ISPC | |
// ISPC branch of the count-leading-zeros helpers. The 8- and 16-bit variants
// cast through the unsigned narrow type to int32_t (zero-extending), call
// count_leading_zeros, and subtract the 24 or 16 leading zeros that the
// extension itself contributes. The 32- and 64-bit variants pass x directly.
static int32_t futrts_clzz8(int8_t x) { | |
return count_leading_zeros((int32_t)(uint8_t)x)-24; | |
} | |
static int32_t futrts_clzz16(int16_t x) { | |
return count_leading_zeros((int32_t)(uint16_t)x)-16; | |
} | |
static int32_t futrts_clzz32(int32_t x) { | |
return count_leading_zeros(x); | |
} | |
static int32_t futrts_clzz64(int64_t x) { | |
return count_leading_zeros(x); | |
} | |
#else // Not OpenCL, ISPC or CUDA, but plain C. | |
// Plain-C count-leading-zeros helpers. A zero argument returns the bit width
// without calling the builtin. The 8- and 16-bit variants zero-extend to 32
// bits and subtract the 24 or 16 leading zeros the extension contributes.
static int32_t futrts_clzz8(int8_t x) {
  if (x == 0) {
    return 8;
  }
  return __builtin_clz((uint32_t)(uint8_t)x) - 24;
}
static int32_t futrts_clzz16(int16_t x) {
  if (x == 0) {
    return 16;
  }
  return __builtin_clz((uint32_t)(uint16_t)x) - 16;
}
static int32_t futrts_clzz32(int32_t x) {
  if (x == 0) {
    return 32;
  }
  return __builtin_clz((uint32_t)x);
}
static int32_t futrts_clzz64(int64_t x) {
  if (x == 0) {
    return 64;
  }
  return __builtin_clzll((uint64_t)x);
}
#endif | |
#if defined(__OPENCL_VERSION__) | |
// OpenCL branch of the count-trailing-zeros helpers, implemented with a loop:
// x is shifted right until its lowest bit is set or all bits of the width
// have been examined, so a zero argument returns the bit width.
static int32_t futrts_ctzz8(int8_t x) {
  int count = 0;
  while (count < 8 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
static int32_t futrts_ctzz16(int16_t x) {
  int count = 0;
  while (count < 16 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
static int32_t futrts_ctzz32(int32_t x) {
  int count = 0;
  while (count < 32 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
static int32_t futrts_ctzz64(int64_t x) {
  int count = 0;
  while (count < 64 && (x & 1) == 0) {
    count++;
    x >>= 1;
  }
  return count;
}
#elif defined(__CUDA_ARCH__) | |
// CUDA branch of the count-trailing-zeros helpers, built on __ffs / __ffsll.
// A result of 0 from the intrinsic is mapped to the bit width; any other
// result is returned minus one.
static int32_t futrts_ctzz8(int8_t x) {
  const int first_set = __ffs(x);
  if (first_set == 0) {
    return 8;
  }
  return first_set - 1;
}
static int32_t futrts_ctzz16(int16_t x) {
  const int first_set = __ffs(x);
  if (first_set == 0) {
    return 16;
  }
  return first_set - 1;
}
static int32_t futrts_ctzz32(int32_t x) {
  const int first_set = __ffs(x);
  if (first_set == 0) {
    return 32;
  }
  return first_set - 1;
}
static int32_t futrts_ctzz64(int64_t x) {
  const int first_set = __ffsll(x);
  if (first_set == 0) {
    return 64;
  }
  return first_set - 1;
}
#elif ISPC | |
// ISPC branch of the count-trailing-zeros helpers. The 8- and 16-bit variants
// return the bit width for a zero argument and otherwise call
// count_trailing_zeros on x cast to int32_t. The 32- and 64-bit variants pass
// x straight to count_trailing_zeros with no zero check.
static int32_t futrts_ctzz8(int8_t x) { | |
return x == 0 ? 8 : count_trailing_zeros((int32_t)x); | |
} | |
static int32_t futrts_ctzz16(int16_t x) { | |
return x == 0 ? 16 : count_trailing_zeros((int32_t)x); | |
} | |
static int32_t futrts_ctzz32(int32_t x) { | |
return count_trailing_zeros(x); | |
} | |
static int32_t futrts_ctzz64(int64_t x) { | |
return count_trailing_zeros(x); | |
} | |
#else // Not OpenCL, CUDA or ISPC, but plain C. | |
// Plain-C count-trailing-zeros helpers. A zero argument returns the bit width
// without calling the builtin; otherwise the compiler builtin is used.
static int32_t futrts_ctzz8(int8_t x) {
  if (x == 0) {
    return 8;
  }
  return __builtin_ctz((uint32_t)x);
}
static int32_t futrts_ctzz16(int16_t x) {
  if (x == 0) {
    return 16;
  }
  return __builtin_ctz((uint32_t)x);
}
static int32_t futrts_ctzz32(int32_t x) {
  if (x == 0) {
    return 32;
  }
  return __builtin_ctz((uint32_t)x);
}
static int32_t futrts_ctzz64(int64_t x) {
  if (x == 0) {
    return 64;
  }
  return __builtin_ctzll((uint64_t)x);
}
#endif | |
// Single-precision arithmetic wrappers around the /, +, - and * operators.
static inline float fdiv32(float x, float y) {
  const float quotient = x / y;
  return quotient;
}
static inline float fadd32(float x, float y) {
  const float sum = x + y;
  return sum;
}
static inline float fsub32(float x, float y) {
  const float difference = x - y;
  return difference;
}
static inline float fmul32(float x, float y) {
  const float product = x * y;
  return product;
}
// Single-precision ordered comparisons.

// True when x is strictly less than y.
static inline bool cmplt32(float x, float y) {
  const bool less = x < y;
  return less;
}
// True when x is less than or equal to y.
static inline bool cmple32(float x, float y) {
  const bool less_or_equal = x <= y;
  return less_or_equal;
}
// Signed integer -> single-precision float conversions, one per integer
// width.  Each is a plain C cast.
static inline float sitofp_i8_f32(int8_t x) {
  const float converted = (float) x;
  return converted;
}
static inline float sitofp_i16_f32(int16_t x) {
  const float converted = (float) x;
  return converted;
}
static inline float sitofp_i32_f32(int32_t x) {
  const float converted = (float) x;
  return converted;
}
static inline float sitofp_i64_f32(int64_t x) {
  const float converted = (float) x;
  return converted;
}
// Unsigned integer -> single-precision float conversions, one per integer
// width.  Each is a plain C cast.
static inline float uitofp_i8_f32(uint8_t x) {
  const float converted = (float) x;
  return converted;
}
static inline float uitofp_i16_f32(uint16_t x) {
  const float converted = (float) x;
  return converted;
}
static inline float uitofp_i32_f32(uint32_t x) {
  const float converted = (float) x;
  return converted;
}
static inline float uitofp_i64_f32(uint64_t x) {
  const float converted = (float) x;
  return converted;
}
#ifdef __OPENCL_VERSION__ | |
// OpenCL variants: forward to the OpenCL built-in math functions.

// Absolute value of x.
static inline float fabs32(float x) {
  return fabs(x);
}
// Larger of x and y, as computed by fmax().
static inline float fmax32(float x, float y) {
  return fmax(x, y);
}
// Smaller of x and y, as computed by fmin().
static inline float fmin32(float x, float y) {
  return fmin(x, y);
}
// x raised to the power y.
static inline float fpow32(float x, float y) {
  return pow(x, y);
}
#elif ISPC | |
static inline float fabs32(float x) { | |
return abs(x); | |
} | |
static inline float fmax32(float x, float y) { | |
return isnan(x) ? y : isnan(y) ? x : max(x, y); | |
} | |
static inline float fmin32(float x, float y) { | |
return isnan(x) ? y : isnan(y) ? x : min(x, y); | |
} | |
static inline float fpow32(float a, float b) { | |
float ret; | |
foreach_active (i) { | |
uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); | |
ret = insert(ret, i, r); | |
} | |
return ret; | |
} | |
#else // Not OpenCL or ISPC, but CUDA or plain C.
static inline float fabs32(float x) { | |
return fabsf(x); | |
} | |
static inline float fmax32(float x, float y) { | |
return fmaxf(x, y); | |
} | |
static inline float fmin32(float x, float y) { | |
return fminf(x, y); | |
} | |
static inline float fpow32(float x, float y) { | |
return powf(x, y); | |
} | |
#endif | |
// True when x is a NaN.
static inline bool futrts_isnan32(float x) {
  const bool is_nan = isnan(x);
  return is_nan;
}
#if ISPC | |
// ISPC: x is infinite when it is not NaN itself but x - x is NaN
// (inf - inf yields NaN, whereas any finite x gives 0).
static inline bool futrts_isinf32(float x) {
  const bool self_difference_is_nan = isnan(x - x);
  return !isnan(x) && self_difference_is_nan;
}
// ISPC: x is finite when it is neither NaN nor infinite.
static inline bool futrts_isfinite32(float x) {
  return !isnan(x) && !futrts_isinf32(x);
}
#else | |
// True when x is positive or negative infinity.
static inline bool futrts_isinf32(float x) {
  const bool is_infinite = isinf(x);
  return is_infinite;
}
#endif | |
// Single-precision float -> signed integer conversions.  NaN and
// +/-infinity are mapped to 0; every other value goes through a plain C
// cast to the target type.
static inline int8_t fptosi_f32_i8(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (int8_t) x;
}
static inline int16_t fptosi_f32_i16(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (int16_t) x;
}
static inline int32_t fptosi_f32_i32(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (int32_t) x;
}
static inline int64_t fptosi_f32_i64(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (int64_t) x;
}
// Single-precision float -> unsigned integer conversions.  NaN and
// +/-infinity are mapped to 0.  Other values are first cast to the signed
// type of the same width and then reinterpreted as unsigned.
static inline uint8_t fptoui_f32_i8(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (uint8_t) (int8_t) x;
}
static inline uint16_t fptoui_f32_i16(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (uint16_t) (int16_t) x;
}
static inline uint32_t fptoui_f32_i32(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (uint32_t) (int32_t) x;
}
static inline uint64_t fptoui_f32_i64(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  }
  return (uint64_t) (int64_t) x;
}
// Float -> bool: any value that compares unequal to zero is true.
static inline bool ftob_f32_bool(float x) {
  const bool nonzero = x != 0;
  return nonzero;
}
// Bool -> float: true becomes 1, false becomes 0.
static inline float btof_bool_f32(bool x) {
  const float as_float = x ? 1 : 0;
  return as_float;
}
#ifdef __OPENCL_VERSION__ | |
// OpenCL variants of the f32 logarithm, root and exponential wrappers.
// Each forwards to the OpenCL built-in of the same name.

// Natural logarithm.
static inline float futrts_log32(float x) {
  return log(x);
}
// Base-2 logarithm.
static inline float futrts_log2_32(float x) {
  return log2(x);
}
// Base-10 logarithm.
static inline float futrts_log10_32(float x) {
  return log10(x);
}
// log1p() of x.
static inline float futrts_log1p_32(float x) {
  return log1p(x);
}
// Square root.
static inline float futrts_sqrt32(float x) {
  return sqrt(x);
}
// Cube root.
static inline float futrts_cbrt32(float x) {
  return cbrt(x);
}
// Exponential function.
static inline float futrts_exp32(float x) {
  return exp(x);
}
// OpenCL variants of the f32 trigonometric wrappers and their inverses.
// Each forwards to the OpenCL built-in of the same name.
static inline float futrts_cos32(float x) {
  return cos(x);
}
static inline float futrts_sin32(float x) {
  return sin(x);
}
static inline float futrts_tan32(float x) {
  return tan(x);
}
static inline float futrts_acos32(float x) {
  return acos(x);
}
static inline float futrts_asin32(float x) {
  return asin(x);
}
static inline float futrts_atan32(float x) {
  return atan(x);
}
// OpenCL variants of the f32 hyperbolic wrappers and their inverses.
// Each forwards to the OpenCL built-in of the same name.
static inline float futrts_cosh32(float x) {
  return cosh(x);
}
static inline float futrts_sinh32(float x) {
  return sinh(x);
}
static inline float futrts_tanh32(float x) {
  return tanh(x);
}
static inline float futrts_acosh32(float x) {
  return acosh(x);
}
static inline float futrts_asinh32(float x) {
  return asinh(x);
}
static inline float futrts_atanh32(float x) {
  return atanh(x);
}
// OpenCL variants of the remaining f32 math wrappers; each forwards to an
// OpenCL built-in.

// atan2() with the arguments passed through in the same order.
static inline float futrts_atan2_32(float x, float y) {
  return atan2(x, y);
}
// hypot() of x and y.
static inline float futrts_hypot32(float x, float y) {
  return hypot(x, y);
}
// Gamma function, via tgamma().
static inline float futrts_gamma32(float x) {
  return tgamma(x);
}
// lgamma() of x.
static inline float futrts_lgamma32(float x) {
  return lgamma(x);
}
// Error function.
static inline float futrts_erf32(float x) {
  return erf(x);
}
// Complementary error function.
static inline float futrts_erfc32(float x) {
  return erfc(x);
}
// Floating-point remainder, via fmod().
static inline float fmod32(float x, float y) {
  return fmod(x, y);
}
// OpenCL variants of the f32 rounding and fused-arithmetic wrappers.

// Rounds to an integral value using rint().
static inline float futrts_round32(float x) {
  return rint(x);
}
static inline float futrts_floor32(float x) {
  return floor(x);
}
static inline float futrts_ceil32(float x) {
  return ceil(x);
}
// nextafter() of x in the direction of y.
static inline float futrts_nextafter32(float x, float y) {
  return nextafter(x, y);
}
// Interpolation between v0 and v1 by t, via the mix() built-in.
static inline float futrts_lerp32(float v0, float v1, float t) {
  return mix(v0, v1, t);
}
// a * b + c via the mad() built-in.
static inline float futrts_mad32(float a, float b, float c) {
  return mad(a, b, c);
}
// a * b + c via the fma() built-in.
static inline float futrts_fma32(float a, float b, float c) {
  return fma(a, b, c);
}
#elif ISPC | |
static inline float futrts_log32(float x) { | |
return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x; | |
} | |
static inline float futrts_log2_32(float x) { | |
return futrts_log32(x) / log(2.0f); | |
} | |
static inline float futrts_log10_32(float x) { | |
return futrts_log32(x) / log(10.0f); | |
} | |
// ISPC: log(1 + x) with a correction term for the rounding error made
// when forming 1 + x.  x == -1 and x == +inf are short-circuited to
// x / 0.
static inline float futrts_log1p_32(float x) {
  if (x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f;
  float one_plus_x = 1.0f + x;
  // What was actually added to 1 after rounding.
  float added = one_plus_x - 1.0f;
  return log(one_plus_x) - (added - x) / one_plus_x;
}
static inline float futrts_sqrt32(float x) { | |
return sqrt(x); | |
} | |
extern "C" unmasked uniform float cbrtf(uniform float); | |
static inline float futrts_cbrt32(float x) { | |
float res; | |
foreach_active (i) { | |
uniform float r = cbrtf(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
// ISPC variants of the f32 exponential and trigonometric wrappers.  Each
// forwards to the ISPC built-in of the same name.
static inline float futrts_exp32(float x) {
  return exp(x);
}
static inline float futrts_cos32(float x) {
  return cos(x);
}
static inline float futrts_sin32(float x) {
  return sin(x);
}
static inline float futrts_tan32(float x) {
  return tan(x);
}
static inline float futrts_acos32(float x) {
  return acos(x);
}
static inline float futrts_asin32(float x) {
  return asin(x);
}
static inline float futrts_atan32(float x) {
  return atan(x);
}
// ISPC: hyperbolic cosine from its exponential definition.
static inline float futrts_cosh32(float x) {
  const float e_pos = exp(x);
  const float e_neg = exp(-x);
  return (e_pos + e_neg) / 2.0f;
}
// ISPC: hyperbolic sine from its exponential definition.
static inline float futrts_sinh32(float x) {
  const float e_pos = exp(x);
  const float e_neg = exp(-x);
  return (e_pos - e_neg) / 2.0f;
}
static inline float futrts_tanh32(float x) { | |
return futrts_sinh32(x)/futrts_cosh32(x); | |
} | |
// ISPC variants of the inverse hyperbolic functions in logarithmic form.
// In each case the log argument is computed first; if it is not finite
// it is returned as-is instead of being passed to log().

// acosh(x) = log(x + sqrt(x^2 - 1)).
static inline float futrts_acosh32(float x) {
  float arg = x + sqrt(x * x - 1);
  return futrts_isfinite32(arg) ? log(arg) : arg;
}
// asinh(x) = log(x + sqrt(x^2 + 1)).
static inline float futrts_asinh32(float x) {
  float arg = x + sqrt(x * x + 1);
  return futrts_isfinite32(arg) ? log(arg) : arg;
}
// atanh(x) = log((1 + x) / (1 - x)) / 2.
static inline float futrts_atanh32(float x) {
  float arg = (1 + x) / (1 - x);
  return futrts_isfinite32(arg) ? log(arg) / 2.0f : arg;
}
static inline float futrts_atan2_32(float x, float y) { | |
return (x == 0.0f && y == 0.0f) ? 0.0f : atan2(x, y); | |
} | |
static inline float futrts_hypot32(float x, float y) { | |
if (futrts_isfinite32(x) && futrts_isfinite32(y)) { | |
x = abs(x); | |
y = abs(y); | |
float a; | |
float b; | |
if (x >= y){ | |
a = x; | |
b = y; | |
} else { | |
a = y; | |
b = x; | |
} | |
if(b == 0){ | |
return a; | |
} | |
int e; | |
float an; | |
float bn; | |
an = frexp (a, &e); | |
bn = ldexp (b, - e); | |
float cn; | |
cn = sqrt (an * an + bn * bn); | |
return ldexp (cn, e); | |
} else { | |
if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY; | |
else return x + y; | |
} | |
} | |
extern "C" unmasked uniform float tgammaf(uniform float x); | |
static inline float futrts_gamma32(float x) { | |
float res; | |
foreach_active (i) { | |
uniform float r = tgammaf(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform float lgammaf(uniform float x); | |
static inline float futrts_lgamma32(float x) { | |
float res; | |
foreach_active (i) { | |
uniform float r = lgammaf(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform float erff(uniform float x); | |
static inline float futrts_erf32(float x) { | |
float res; | |
foreach_active (i) { | |
uniform float r = erff(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform float erfcf(uniform float x); | |
static inline float futrts_erfc32(float x) { | |
float res; | |
foreach_active (i) { | |
uniform float r = erfcf(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
// ISPC: remainder of x / y, computed as x - y * trunc(x / y).
static inline float fmod32(float x, float y) {
  const float whole_quotient = trunc(x / y);
  return x - y * whole_quotient;
}
// ISPC: rounding wrappers around the built-ins of the same name.
static inline float futrts_round32(float x) {
  return round(x);
}
static inline float futrts_floor32(float x) {
  return floor(x);
}
static inline float futrts_ceil32(float x) {
  return ceil(x);
}
extern "C" unmasked uniform float nextafterf(uniform float x, uniform float y); | |
static inline float futrts_nextafter32(float x, float y) { | |
float res; | |
foreach_active (i) { | |
uniform float r = nextafterf(extract(x, i), extract(y, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
// ISPC: linear interpolation from v0 (t == 0) to v1 (t == 1).
static inline float futrts_lerp32(float v0, float v1, float t) {
  const float span = v1 - v0;
  return v0 + span * t;
}
// ISPC: multiply-add written as ordinary arithmetic.
static inline float futrts_mad32(float a, float b, float c) {
  return a * b + c;
}
// ISPC: written as ordinary arithmetic, exactly like futrts_mad32.
static inline float futrts_fma32(float a, float b, float c) {
  return a * b + c;
}
#else // Not OpenCL or ISPC, but CUDA or plain C. | |
static inline float futrts_log32(float x) { | |
return logf(x); | |
} | |
static inline float futrts_log2_32(float x) { | |
return log2f(x); | |
} | |
static inline float futrts_log10_32(float x) { | |
return log10f(x); | |
} | |
static inline float futrts_log1p_32(float x) { | |
return log1pf(x); | |
} | |
static inline float futrts_sqrt32(float x) { | |
return sqrtf(x); | |
} | |
static inline float futrts_cbrt32(float x) { | |
return cbrtf(x); | |
} | |
static inline float futrts_exp32(float x) { | |
return expf(x); | |
} | |
static inline float futrts_cos32(float x) { | |
return cosf(x); | |
} | |
static inline float futrts_sin32(float x) { | |
return sinf(x); | |
} | |
static inline float futrts_tan32(float x) { | |
return tanf(x); | |
} | |
static inline float futrts_acos32(float x) { | |
return acosf(x); | |
} | |
static inline float futrts_asin32(float x) { | |
return asinf(x); | |
} | |
static inline float futrts_atan32(float x) { | |
return atanf(x); | |
} | |
static inline float futrts_cosh32(float x) { | |
return coshf(x); | |
} | |
static inline float futrts_sinh32(float x) { | |
return sinhf(x); | |
} | |
static inline float futrts_tanh32(float x) { | |
return tanhf(x); | |
} | |
static inline float futrts_acosh32(float x) { | |
return acoshf(x); | |
} | |
static inline float futrts_asinh32(float x) { | |
return asinhf(x); | |
} | |
static inline float futrts_atanh32(float x) { | |
return atanhf(x); | |
} | |
static inline float futrts_atan2_32(float x, float y) { | |
return atan2f(x, y); | |
} | |
static inline float futrts_hypot32(float x, float y) { | |
return hypotf(x, y); | |
} | |
static inline float futrts_gamma32(float x) { | |
return tgammaf(x); | |
} | |
static inline float futrts_lgamma32(float x) { | |
return lgammaf(x); | |
} | |
static inline float futrts_erf32(float x) { | |
return erff(x); | |
} | |
static inline float futrts_erfc32(float x) { | |
return erfcf(x); | |
} | |
static inline float fmod32(float x, float y) { | |
return fmodf(x, y); | |
} | |
static inline float futrts_round32(float x) { | |
return rintf(x); | |
} | |
static inline float futrts_floor32(float x) { | |
return floorf(x); | |
} | |
static inline float futrts_ceil32(float x) { | |
return ceilf(x); | |
} | |
static inline float futrts_nextafter32(float x, float y) { | |
return nextafterf(x, y); | |
} | |
static inline float futrts_lerp32(float v0, float v1, float t) { | |
return v0 + (v1 - v0) * t; | |
} | |
static inline float futrts_mad32(float a, float b, float c) { | |
return a * b + c; | |
} | |
static inline float futrts_fma32(float a, float b, float c) { | |
return fmaf(a, b, c); | |
} | |
#endif | |
#if ISPC | |
// ISPC: bit pattern of a float as a 32-bit integer, via intbits().
static inline int32_t futrts_to_bits32(float x) {
  const int32_t bits = intbits(x);
  return bits;
}
// ISPC: float whose bit pattern is x, via floatbits().
static inline float futrts_from_bits32(int32_t x) {
  const float value = floatbits(x);
  return value;
}
#else | |
// Bit pattern of a float as a 32-bit integer, obtained by writing one
// union member and reading the other.
static inline int32_t futrts_to_bits32(float x) {
  union {
    float as_float;
    int32_t as_bits;
  } pun;
  pun.as_float = x;
  return pun.as_bits;
}
// Float whose bit pattern is the given 32-bit integer, obtained by
// writing one union member and reading the other.
static inline float futrts_from_bits32(int32_t x) {
  union {
    int32_t as_bits;
    float as_float;
  } pun;
  pun.as_bits = x;
  return pun.as_float;
}
#endif | |
// Sign of x as -1, 0 or 1; a NaN argument is returned unchanged.
static inline float fsignum32(float x) {
  const int positive = x > 0 ? 1 : 0;
  const int negative = x < 0 ? 1 : 0;
  return futrts_isnan32(x) ? x : positive - negative;
}
#ifdef FUTHARK_F64_ENABLED | |
#if ISPC | |
// ISPC: x is infinite when it is not NaN itself but x - x is NaN
// (inf - inf yields NaN, whereas any finite x gives 0).
//
// The parameter is 'double' so the test is carried out in double
// precision: with a 'float' parameter, finite doubles beyond the float
// range would be narrowed to infinity and misreported.
static inline bool futrts_isinf64(double x) {
  return !isnan(x) && isnan(x - x);
}
// ISPC: x is finite when it is neither NaN nor infinite.  Takes 'double'
// for the same reason as futrts_isinf64.
static inline bool futrts_isfinite64(double x) {
  return !isnan(x) && !futrts_isinf64(x);
}
static inline double fdiv64(double x, double y) { | |
return x / y; | |
} | |
static inline double fadd64(double x, double y) { | |
return x + y; | |
} | |
static inline double fsub64(double x, double y) { | |
return x - y; | |
} | |
static inline double fmul64(double x, double y) { | |
return x * y; | |
} | |
static inline bool cmplt64(double x, double y) { | |
return x < y; | |
} | |
static inline bool cmple64(double x, double y) { | |
return x <= y; | |
} | |
static inline double sitofp_i8_f64(int8_t x) { | |
return (double) x; | |
} | |
static inline double sitofp_i16_f64(int16_t x) { | |
return (double) x; | |
} | |
static inline double sitofp_i32_f64(int32_t x) { | |
return (double) x; | |
} | |
static inline double sitofp_i64_f64(int64_t x) { | |
return (double) x; | |
} | |
static inline double uitofp_i8_f64(uint8_t x) { | |
return (double) x; | |
} | |
static inline double uitofp_i16_f64(uint16_t x) { | |
return (double) x; | |
} | |
static inline double uitofp_i32_f64(uint32_t x) { | |
return (double) x; | |
} | |
static inline double uitofp_i64_f64(uint64_t x) { | |
return (double) x; | |
} | |
// ISPC: absolute value of x.
static inline double fabs64(double x) {
  return abs(x);
}
// ISPC: maximum of x and y; when exactly one argument is NaN the other
// one is returned.
static inline double fmax64(double x, double y) {
  return isnan(x) ? y : (isnan(y) ? x : max(x, y));
}
// ISPC: minimum of x and y; when exactly one argument is NaN the other
// one is returned.
static inline double fmin64(double x, double y) {
  return isnan(x) ? y : (isnan(y) ? x : min(x, y));
}
static inline double fpow64(double a, double b) { | |
float ret; | |
foreach_active (i) { | |
uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); | |
ret = insert(ret, i, r); | |
} | |
return ret; | |
} | |
static inline double futrts_log64(double x) { | |
return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? log(x) : x; | |
} | |
static inline double futrts_log2_64(double x) { | |
return futrts_log64(x)/log(2.0d); | |
} | |
static inline double futrts_log10_64(double x) { | |
return futrts_log64(x)/log(10.0d); | |
} | |
static inline double futrts_log1p_64(double x) { | |
if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d; | |
double y = 1.0d + x; | |
double z = y - 1.0d; | |
return log(y) - (z-x)/y; | |
} | |
static inline double futrts_sqrt64(double x) { | |
return sqrt(x); | |
} | |
extern "C" unmasked uniform double cbrt(uniform double); | |
static inline double futrts_cbrt64(double x) { | |
double res; | |
foreach_active (i) { | |
uniform double r = cbrtf(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
// ISPC variants of the f64 exponential and trigonometric wrappers.  Each
// forwards to the ISPC built-in of the same name.
static inline double futrts_exp64(double x) {
  return exp(x);
}
static inline double futrts_cos64(double x) {
  return cos(x);
}
static inline double futrts_sin64(double x) {
  return sin(x);
}
static inline double futrts_tan64(double x) {
  return tan(x);
}
static inline double futrts_acos64(double x) {
  return acos(x);
}
static inline double futrts_asin64(double x) {
  return asin(x);
}
static inline double futrts_atan64(double x) {
  return atan(x);
}
// ISPC: hyperbolic cosine from its exponential definition.
static inline double futrts_cosh64(double x) {
  const double e_pos = exp(x);
  const double e_neg = exp(-x);
  return (e_pos + e_neg) / 2.0d;
}
// ISPC: hyperbolic sine from its exponential definition.
static inline double futrts_sinh64(double x) {
  const double e_pos = exp(x);
  const double e_neg = exp(-x);
  return (e_pos - e_neg) / 2.0d;
}
static inline double futrts_tanh64(double x) { | |
return futrts_sinh64(x)/futrts_cosh64(x); | |
} | |
// ISPC variants of the inverse hyperbolic functions in logarithmic form.
// In each case the log argument is computed first; if it is not finite
// it is returned as-is instead of being passed to log().

// acosh(x) = log(x + sqrt(x^2 - 1)).
static inline double futrts_acosh64(double x) {
  double arg = x + sqrt(x * x - 1.0d);
  return futrts_isfinite64(arg) ? log(arg) : arg;
}
// asinh(x) = log(x + sqrt(x^2 + 1)).
static inline double futrts_asinh64(double x) {
  double arg = x + sqrt(x * x + 1.0d);
  return futrts_isfinite64(arg) ? log(arg) : arg;
}
// atanh(x) = log((1 + x) / (1 - x)) / 2.
static inline double futrts_atanh64(double x) {
  double arg = (1.0d + x) / (1.0d - x);
  return futrts_isfinite64(arg) ? log(arg) / 2.0d : arg;
}
static inline double futrts_atan2_64(double x, double y) { | |
return atan2(x, y); | |
} | |
extern "C" unmasked uniform double hypot(uniform double x, uniform double y); | |
static inline double futrts_hypot64(double x, double y) { | |
double res; | |
foreach_active (i) { | |
uniform double r = hypot(extract(x, i), extract(y, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform double tgamma(uniform double x); | |
static inline double futrts_gamma64(double x) { | |
double res; | |
foreach_active (i) { | |
uniform double r = tgamma(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform double lgamma(uniform double x); | |
static inline double futrts_lgamma64(double x) { | |
double res; | |
foreach_active (i) { | |
uniform double r = lgamma(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform double erf(uniform double x); | |
static inline double futrts_erf64(double x) { | |
double res; | |
foreach_active (i) { | |
uniform double r = erf(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
extern "C" unmasked uniform double erfc(uniform double x); | |
static inline double futrts_erfc64(double x) { | |
double res; | |
foreach_active (i) { | |
uniform double r = erfc(extract(x, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
// ISPC: multiply-add written as ordinary arithmetic.
static inline double futrts_fma64(double a, double b, double c) {
  const double product = a * b;
  return product + c;
}
// ISPC: rounding wrappers around the built-ins of the same name.
static inline double futrts_round64(double x) {
  return round(x);
}
static inline double futrts_ceil64(double x) {
  return ceil(x);
}
extern "C" unmasked uniform double nextafter(uniform float x, uniform double y); | |
static inline float futrts_nextafter64(double x, double y) { | |
double res; | |
foreach_active (i) { | |
uniform double r = nextafter(extract(x, i), extract(y, i)); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
static inline double futrts_floor64(double x) { | |
return floor(x); | |
} | |
static inline bool futrts_isnan64(double x) { | |
return isnan(x); | |
} | |
static inline int8_t fptosi_f64_i8(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (int8_t) x; | |
} | |
} | |
static inline int16_t fptosi_f64_i16(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (int16_t) x; | |
} | |
} | |
static inline int32_t fptosi_f64_i32(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (int32_t) x; | |
} | |
} | |
static inline int64_t fptosi_f64_i64(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (int64_t) x; | |
} | |
} | |
static inline uint8_t fptoui_f64_i8(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (uint8_t) (int8_t) x; | |
} | |
} | |
static inline uint16_t fptoui_f64_i16(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (uint16_t) (int16_t) x; | |
} | |
} | |
static inline uint32_t fptoui_f64_i32(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (uint32_t) (int32_t) x; | |
} | |
} | |
static inline uint64_t fptoui_f64_i64(double x) { | |
if (futrts_isnan64(x) || futrts_isinf64(x)) { | |
return 0; | |
} else { | |
return (uint64_t) (int64_t) x; | |
} | |
} | |
// ISPC: double -> bool; any value that compares unequal to zero is true.
static inline bool ftob_f64_bool(double x) {
  const bool nonzero = x != 0.0;
  return nonzero;
}
// ISPC: bool -> double; true becomes 1.0, false becomes 0.0.
static inline double btof_bool_f64(bool x) {
  const double as_double = x ? 1.0 : 0.0;
  return as_double;
}
static inline int64_t futrts_to_bits64(double x) { | |
int64_t res; | |
foreach_active (i) { | |
uniform double tmp = extract(x, i); | |
uniform int64_t r = *((uniform int64_t* uniform)&tmp); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
static inline double futrts_from_bits64(int64_t x) { | |
double res; | |
foreach_active (i) { | |
uniform int64_t tmp = extract(x, i); | |
uniform double r = *((uniform double* uniform)&tmp); | |
res = insert(res, i, r); | |
} | |
return res; | |
} | |
// ISPC: remainder of x / y, computed as x - y * trunc(x / y).
static inline double fmod64(double x, double y) {
  const double whole_quotient = trunc(x/y);
  return x - y * whole_quotient;
}
// ISPC: sign of x as -1, 0 or 1; a NaN argument is returned unchanged.
static inline double fsignum64(double x) {
  const double positive = x > 0 ? 1.0d : 0.0d;
  const double negative = x < 0 ? 1.0d : 0.0d;
  return futrts_isnan64(x) ? x : positive - negative;
}
// ISPC: linear interpolation from v0 (t == 0) to v1 (t == 1).
static inline double futrts_lerp64(double v0, double v1, double t) {
  const double span = v1 - v0;
  return v0 + span * t;
}
// ISPC: multiply-add written as ordinary arithmetic.
static inline double futrts_mad64(double a, double b, double c) {
  return a * b + c;
}
// ISPC: conversions between the two floating-point widths.  The
// same-width versions are identity casts.
static inline float fpconv_f32_f32(float x) {
  const float same = (float) x;
  return same;
}
static inline double fpconv_f32_f64(float x) {
  const double widened = (double) x;
  return widened;
}
static inline float fpconv_f64_f32(double x) {
  const float narrowed = (float) x;
  return narrowed;
}
static inline double fpconv_f64_f64(double x) {
  const double same = (double) x;
  return same;
}
#else | |
// Double-precision arithmetic primitives, each a direct use of the
// corresponding C operator.
static inline double fdiv64(double x, double y) {
  const double quotient = x / y;
  return quotient;
}
static inline double fadd64(double x, double y) {
  const double sum = x + y;
  return sum;
}
static inline double fsub64(double x, double y) {
  const double difference = x - y;
  return difference;
}
static inline double fmul64(double x, double y) {
  const double product = x * y;
  return product;
}
// Double-precision ordered comparisons.
static inline bool cmplt64(double x, double y) {
  const bool less = x < y;
  return less;
}
static inline bool cmple64(double x, double y) {
  const bool less_or_equal = x <= y;
  return less_or_equal;
}
// Signed integer -> double conversions, one per integer width.
static inline double sitofp_i8_f64(int8_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double sitofp_i16_f64(int16_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double sitofp_i32_f64(int32_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double sitofp_i64_f64(int64_t x) {
  const double converted = (double) x;
  return converted;
}
// Unsigned integer -> double conversions, one per integer width.
static inline double uitofp_i8_f64(uint8_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double uitofp_i16_f64(uint16_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double uitofp_i32_f64(uint32_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double uitofp_i64_f64(uint64_t x) {
  const double converted = (double) x;
  return converted;
}
static inline double fabs64(double x) { | |
return fabs(x); | |
} | |
static inline double fmax64(double x, double y) { | |
return fmax(x, y); | |
} | |
static inline double fmin64(double x, double y) { | |
return fmin(x, y); | |
} | |
static inline double fpow64(double x, double y) { | |
return pow(x, y); | |
} | |
static inline double futrts_log64(double x) { | |
return log(x); | |
} | |
static inline double futrts_log2_64(double x) { | |
return log2(x); | |
} | |
static inline double futrts_log10_64(double x) { | |
return log10(x); | |
} | |
static inline double futrts_log1p_64(double x) { | |
return log1p(x); | |
} | |
static inline double futrts_sqrt64(double x) { | |
return sqrt(x); | |
} | |
static inline double futrts_cbrt64(double x) { | |
return cbrt(x); | |
} | |
static inline double futrts_exp64(double x) { | |
return exp(x); | |
} | |
static inline double futrts_cos64(double x) { | |
return cos(x); | |
} | |
static inline double futrts_sin64(double x) { | |
return sin(x); | |
} | |
static inline double futrts_tan64(double x) { | |
return tan(x); | |
} | |
static inline double futrts_acos64(double x) { | |
return acos(x); | |
} | |
static inline double futrts_asin64(double x) { | |
return asin(x); | |
} | |
static inline double futrts_atan64(double x) { | |
return atan(x); | |
} | |
static inline double futrts_cosh64(double x) { | |
return cosh(x); | |
} | |
static inline double futrts_sinh64(double x) { | |
return sinh(x); | |
} | |
static inline double futrts_tanh64(double x) { | |
return tanh(x); | |
} | |
static inline double futrts_acosh64(double x) { | |
return acosh(x); | |
} | |
static inline double futrts_asinh64(double x) { | |
return asinh(x); | |
} | |
static inline double futrts_atanh64(double x) { | |
return atanh(x); | |
} | |
static inline double futrts_atan2_64(double x, double y) { | |
return atan2(x, y); | |
} | |
static inline double futrts_hypot64(double x, double y) { | |
return hypot(x, y); | |
} | |
static inline double futrts_gamma64(double x) { | |
return tgamma(x); | |
} | |
static inline double futrts_lgamma64(double x) { | |
return lgamma(x); | |
} | |
static inline double futrts_erf64(double x) { | |
return erf(x); | |
} | |
static inline double futrts_erfc64(double x) { | |
return erfc(x); | |
} | |
static inline double futrts_fma64(double a, double b, double c) { | |
return fma(a, b, c); | |
} | |
static inline double futrts_round64(double x) { | |
return rint(x); | |
} | |
static inline double futrts_ceil64(double x) { | |
return ceil(x); | |
} | |
static inline float futrts_nextafter64(float x, float y) { | |
return nextafter(x, y); | |
} | |
static inline double futrts_floor64(double x) { | |
return floor(x); | |
} | |
static inline bool futrts_isnan64(double x) { | |
return isnan(x); | |
} | |
static inline bool futrts_isinf64(double x) { | |
return isinf(x); | |
} | |
// Double -> signed integer conversions.  NaN and +/-infinity are mapped
// to 0; every other value goes through a plain C cast to the target type.
static inline int8_t fptosi_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (int8_t) x;
}
static inline int16_t fptosi_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (int16_t) x;
}
static inline int32_t fptosi_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (int32_t) x;
}
static inline int64_t fptosi_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (int64_t) x;
}
// Double -> unsigned integer conversions.  NaN and +/-infinity are mapped
// to 0.  Other values are first cast to the signed type of the same width
// and then reinterpreted as unsigned.
static inline uint8_t fptoui_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (uint8_t) (int8_t) x;
}
static inline uint16_t fptoui_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (uint16_t) (int16_t) x;
}
static inline uint32_t fptoui_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (uint32_t) (int32_t) x;
}
static inline uint64_t fptoui_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  }
  return (uint64_t) (int64_t) x;
}
// Double -> bool: any value that compares unequal to zero is true.
static inline bool ftob_f64_bool(double x) {
  const bool nonzero = x != 0;
  return nonzero;
}
// Bool -> double: true becomes 1, false becomes 0.
static inline double btof_bool_f64(bool x) {
  const double as_double = x ? 1 : 0;
  return as_double;
}
// Bit pattern of a double as a 64-bit integer, obtained by writing one
// union member and reading the other.
static inline int64_t futrts_to_bits64(double x) {
  union {
    double as_double;
    int64_t as_bits;
  } pun;
  pun.as_double = x;
  return pun.as_bits;
}
// Double whose bit pattern is the given 64-bit integer; the inverse of
// futrts_to_bits64.
static inline double futrts_from_bits64(int64_t x) {
  union {
    int64_t as_bits;
    double as_double;
  } pun;
  pun.as_bits = x;
  return pun.as_double;
}
static inline double fmod64(double x, double y) { | |
return fmod(x, y); | |
} | |
// Sign of x as -1, 0 or 1; a NaN argument is returned unchanged.
static inline double fsignum64(double x) {
  const int positive = (x > 0);
  const int negative = (x < 0);
  return futrts_isnan64(x) ? x : positive - negative;
}
// Interpolation from v0 (t == 0) to v1 (t == 1).  OpenCL uses its mix()
// built-in; everywhere else the formula is written out.
static inline double futrts_lerp64(double v0, double v1, double t) {
#ifdef __OPENCL_VERSION__
  return mix(v0, v1, t);
#else
  const double span = v1 - v0;
  return v0 + span * t;
#endif
}
// a * b + c.  OpenCL uses its mad() built-in; everywhere else the
// expression is written out.
static inline double futrts_mad64(double a, double b, double c) {
#ifdef __OPENCL_VERSION__
  return mad(a, b, c);
#else
  return a * b + c;
#endif
}
// Conversions between the two floating-point widths.  The same-width
// versions are identity casts.
static inline float fpconv_f32_f32(float x) {
  const float same = (float) x;
  return same;
}
static inline double fpconv_f32_f64(float x) {
  const double widened = (double) x;
  return widened;
}
static inline float fpconv_f64_f32(double x) {
  const float narrowed = (float) x;
  return narrowed;
}
static inline double fpconv_f64_f64(double x) {
  const double same = (double) x;
  return same;
}
#endif | |
#endif | |
// End of scalar.h. | |
// Start of scalar_f16.h. | |
// Half-precision is emulated if needed (e.g. in straight C) with the | |
// native type used if possible. The emulation works by typedef'ing | |
// 'float' to 'f16', and then implementing all operations on single | |
// precision. To cut down on duplication, we use the same code for | |
// those Futhark functions that require just operators or casts. The | |
// in-memory representation for arrays will still be 16 bits even | |
// under emulation, so the compiler will have to be careful when | |
// generating reads or writes. | |
#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC)) | |
#define EMULATE_F16 | |
#endif | |
#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__) | |
#pragma OPENCL EXTENSION cl_khr_fp16 : enable | |
#endif | |
#ifdef EMULATE_F16 | |
// Note that the half-precision storage format is still 16 bits, so the
// compiler has to be very careful when generating reads and writes!
typedef float f16; | |
#elif ISPC | |
typedef float16 f16; | |
#else | |
#ifdef __CUDA_ARCH__ | |
#include <cuda_fp16.h> | |
#endif | |
typedef half f16; | |
#endif | |
// Some of these functions convert to single precision because half | |
// precision versions are not available. | |
// Half-precision arithmetic primitives, written with plain operators on
// the f16 typedef.
static inline f16 fadd16(f16 x, f16 y) {
  const f16 sum = x + y;
  return sum;
}
static inline f16 fsub16(f16 x, f16 y) {
  const f16 difference = x - y;
  return difference;
}
static inline f16 fmul16(f16 x, f16 y) {
  const f16 product = x * y;
  return product;
}
// Half-precision ordered comparisons.
static inline bool cmplt16(f16 x, f16 y) {
  const bool less = x < y;
  return less;
}
static inline bool cmple16(f16 x, f16 y) {
  const bool less_or_equal = x <= y;
  return less_or_equal;
}
static inline f16 sitofp_i8_f16(int8_t x) { | |
return (f16) x; | |
} | |
static inline f16 sitofp_i16_f16(int16_t x) { | |
return (f16) x; | |
} | |
static inline f16 sitofp_i32_f16(int32_t x) { | |
return (f16) x; | |
} | |
static inline f16 sitofp_i64_f16(int64_t x) { | |
return (f16) x; | |
} | |
static inline f16 uitofp_i8_f16(uint8_t x) { | |
return (f16) x; | |
} | |
static inline f16 uitofp_i16_f16(uint16_t x) { | |
return (f16) x; | |
} | |
static inline f16 uitofp_i32_f16(uint32_t x) { | |
return (f16) x; | |
} | |
static inline f16 uitofp_i64_f16(uint64_t x) { | |
return (f16) x; | |
} | |
// f16 to signed integer conversions (truncating towards zero).
//
// Converting NaN or an infinity to an integer is undefined behaviour
// in C, so those inputs yield 0 instead of reaching the cast.  The
// test uses only operators because the isnan/isinf helpers are defined
// further down: x - x is NaN exactly when x is NaN or infinite, and
// NaN is the only value that compares unequal to itself.
static inline int8_t fptosi_f16_i8(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  // The 8-bit variant goes through float first, as in the original.
  return (int8_t) (float) x;
}
static inline int16_t fptosi_f16_i16(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  return (int16_t) x;
}
static inline int32_t fptosi_f16_i32(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  return (int32_t) x;
}
static inline int64_t fptosi_f16_i64(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  return (int64_t) x;
}
// f16 to unsigned integer conversions (truncating towards zero).
//
// Converting NaN or an infinity to an integer is undefined behaviour
// in C, so those inputs yield 0 instead of reaching the cast.  x - x
// is NaN exactly when x is NaN or infinite, and NaN is the only value
// that compares unequal to itself.
static inline uint8_t fptoui_f16_i8(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  // The 8-bit variant goes through float first, as in the original.
  return (uint8_t) (float) x;
}
static inline uint16_t fptoui_f16_i16(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  return (uint16_t) x;
}
static inline uint32_t fptoui_f16_i32(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  return (uint32_t) x;
}
static inline uint64_t fptoui_f16_i64(f16 x) {
  f16 probe = x - x;
  if (probe != probe) {
    return 0;
  }
  return (uint64_t) x;
}
// f16 to bool: true for every value that compares unequal to zero
// (which includes NaN).
static inline bool ftob_f16_bool(f16 x) {
  return x != (f16)0;
}
// bool to f16: true becomes 1, false becomes 0.
static inline f16 btof_bool_f16(bool x) {
  return x ? 1 : 0;
}
#ifndef EMULATE_F16 | |
// NaN test for native f16; the value is widened to float so the
// single-precision isnan can be used.
static inline bool futrts_isnan16(f16 x) {
  return isnan((float)x);
}
#ifdef __OPENCL_VERSION__ | |
// OpenCL with native half: forward to the overloaded built-ins.
static inline f16 fabs16(f16 x) {
  return fabs(x);
}
static inline f16 fmax16(f16 x, f16 y) {
  return fmax(x, y);
}
static inline f16 fmin16(f16 x, f16 y) {
  return fmin(x, y);
}
static inline f16 fpow16(f16 x, f16 y) {
  return pow(x, y);
}
#elif ISPC | |
// ISPC with native float16.
static inline f16 fabs16(f16 x) {
  return abs(x);
}
// If exactly one operand is NaN the other one is returned; otherwise
// the result comes from max().
static inline f16 fmax16(f16 x, f16 y) {
  return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y);
}
// If exactly one operand is NaN the other one is returned; otherwise
// the result comes from min().
static inline f16 fmin16(f16 x, f16 y) {
  return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y);
}
static inline f16 fpow16(f16 x, f16 y) {
  return pow(x, y);
}
#else // Assuming CUDA. | |
// CUDA with native half: these go through the single-precision
// functions, relying on the implicit half <-> float conversions.
static inline f16 fabs16(f16 x) {
  return fabsf(x);
}
static inline f16 fmax16(f16 x, f16 y) {
  return fmaxf(x, y);
}
static inline f16 fmin16(f16 x, f16 y) {
  return fminf(x, y);
}
static inline f16 fpow16(f16 x, f16 y) {
  return powf(x, y);
}
#endif | |
#if ISPC | |
// ISPC infinity test: x is infinite when it is not NaN itself but
// x - x is.  The parameter is f16, matching the non-ISPC variant and
// avoiding a detour through float.
static inline bool futrts_isinf16(f16 x) {
  return !futrts_isnan16(x) && futrts_isnan16(x - x);
}
// True when x is neither NaN nor infinite.
static inline bool futrts_isfinite16(f16 x) {
  return !futrts_isnan16(x) && !futrts_isinf16(x);
}
#else | |
// Infinity test for native f16; the value is widened to float so the
// single-precision isinf can be used.
static inline bool futrts_isinf16(f16 x) {
  return isinf((float)x);
}
#endif | |
#ifdef __OPENCL_VERSION__ | |
// OpenCL with native half: logarithms, roots and exp forward to the
// overloaded built-in of the same name.
static inline f16 futrts_log16(f16 x) {
  return log(x);
}
static inline f16 futrts_log2_16(f16 x) {
  return log2(x);
}
static inline f16 futrts_log10_16(f16 x) {
  return log10(x);
}
static inline f16 futrts_log1p_16(f16 x) {
  return log1p(x);
}
static inline f16 futrts_sqrt16(f16 x) {
  return sqrt(x);
}
static inline f16 futrts_cbrt16(f16 x) {
  return cbrt(x);
}
static inline f16 futrts_exp16(f16 x) {
  return exp(x);
}
// OpenCL with native half: trigonometric and hyperbolic functions
// forward to the overloaded built-in of the same name.
static inline f16 futrts_cos16(f16 x) {
  return cos(x);
}
static inline f16 futrts_sin16(f16 x) {
  return sin(x);
}
static inline f16 futrts_tan16(f16 x) {
  return tan(x);
}
static inline f16 futrts_acos16(f16 x) {
  return acos(x);
}
static inline f16 futrts_asin16(f16 x) {
  return asin(x);
}
static inline f16 futrts_atan16(f16 x) {
  return atan(x);
}
static inline f16 futrts_cosh16(f16 x) {
  return cosh(x);
}
static inline f16 futrts_sinh16(f16 x) {
  return sinh(x);
}
static inline f16 futrts_tanh16(f16 x) {
  return tanh(x);
}
static inline f16 futrts_acosh16(f16 x) {
  return acosh(x);
}
static inline f16 futrts_asinh16(f16 x) {
  return asinh(x);
}
static inline f16 futrts_atanh16(f16 x) {
  return atanh(x);
}
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return atan2(x, y);
}
static inline f16 futrts_hypot16(f16 x, f16 y) {
  return hypot(x, y);
}
// OpenCL with native half: special functions, rounding and fused
// operations forward to OpenCL built-ins.
static inline f16 futrts_gamma16(f16 x) {
  return tgamma(x);
}
static inline f16 futrts_lgamma16(f16 x) {
  return lgamma(x);
}
static inline f16 futrts_erf16(f16 x) {
  return erf(x);
}
static inline f16 futrts_erfc16(f16 x) {
  return erfc(x);
}
static inline f16 fmod16(f16 x, f16 y) {
  return fmod(x, y);
}
// Rounding is done with rint(), not round().
static inline f16 futrts_round16(f16 x) {
  return rint(x);
}
static inline f16 futrts_floor16(f16 x) {
  return floor(x);
}
static inline f16 futrts_ceil16(f16 x) {
  return ceil(x);
}
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return nextafter(x, y);
}
// Linear interpolation via the mix() built-in.
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return mix(v0, v1, t);
}
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return mad(a, b, c);
}
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return fma(a, b, c);
}
#elif ISPC | |
// ISPC with native float16: logarithms, sqrt and exp.

// log() is only applied to finite inputs and to negative infinity;
// NaN and positive infinity are returned unchanged.
static inline f16 futrts_log16(f16 x) {
  return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x;
}
// Base-2 logarithm by change of base.
static inline f16 futrts_log2_16(f16 x) {
  return futrts_log16(x) / log(2.0f16);
}
// Base-10 logarithm by change of base.
static inline f16 futrts_log10_16(f16 x) {
  return futrts_log16(x) / log(10.0f16);
}
// log(1 + x).  The term (z-x)/y corrects for the rounding error made
// when forming y = 1 + x.
static inline f16 futrts_log1p_16(f16 x) {
  // For x == -1 and x == +inf the result is x / 0.
  if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16;
  f16 y = 1.0f16 + x;
  f16 z = y - 1.0f16;
  return log(y) - (z-x)/y;
}
// Computed in single precision and narrowed.
static inline f16 futrts_sqrt16(f16 x) {
  return (float16)sqrt((float)x);
}
static inline f16 futrts_exp16(f16 x) {
  return exp(x);
}
// ISPC with native float16: trigonometric functions are evaluated in
// single precision and narrowed; cosh and sinh are built from exp().
static inline f16 futrts_cos16(f16 x) {
  return (float16)cos((float)x);
}
static inline f16 futrts_sin16(f16 x) {
  return (float16)sin((float)x);
}
static inline f16 futrts_tan16(f16 x) {
  return (float16)tan((float)x);
}
static inline f16 futrts_acos16(f16 x) {
  return (float16)acos((float)x);
}
static inline f16 futrts_asin16(f16 x) {
  return (float16)asin((float)x);
}
static inline f16 futrts_atan16(f16 x) {
  return (float16)atan((float)x);
}
static inline f16 futrts_cosh16(f16 x) {
  return (exp(x)+exp(-x)) / 2.0f16;
}
static inline f16 futrts_sinh16(f16 x) {
  return (exp(x)-exp(-x)) / 2.0f16;
}
// Hyperbolic tangent.
//
// Computed as (1 - e) / (1 + e) with e = exp(-2|x|), then given the
// sign of x.  e always lies in [0, 1], so nothing overflows; the
// sinh(x)/cosh(x) form evaluates to inf/inf = NaN as soon as exp(x)
// exceeds the half-precision range.  NaN inputs still produce NaN.
static inline f16 futrts_tanh16(f16 x) {
  f16 e = exp(-2.0f16 * abs(x));
  f16 t = (1.0f16 - e) / (1.0f16 + e);
  return x < 0.0f16 ? -t : t;
}
// ISPC with native float16: inverse hyperbolic functions via their
// logarithmic forms.  When the argument of the logarithm is not finite
// it is returned as-is instead of being passed to log().
static inline f16 futrts_acosh16(f16 x) {
  float16 f = x+(float16)sqrt((float)(x*x-1));
  if(futrts_isfinite16(f)) return log(f);
  return f;
}
static inline f16 futrts_asinh16(f16 x) {
  float16 f = x+(float16)sqrt((float)(x*x+1));
  if(futrts_isfinite16(f)) return log(f);
  return f;
}
static inline f16 futrts_atanh16(f16 x) {
  float16 f = (1+x)/(1-x);
  if(futrts_isfinite16(f)) return log(f)/2.0f16;
  return f;
}
// Evaluated in single precision and narrowed.
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return (float16)atan2((float)x, (float)y);
}
// Delegates to the 32-bit implementation.
static inline f16 futrts_hypot16(f16 x, f16 y) {
  return (float16)futrts_hypot32((float)x, (float)y);
}
// ISPC with native float16: gamma functions call the C library one
// lane at a time; cbrt/erf/erfc delegate to the 32-bit versions.
extern "C" unmasked uniform float tgammaf(uniform float x);
static inline f16 futrts_gamma16(f16 x) {
  f16 res;
  // Only active lanes are computed and written back.
  foreach_active (i) {
    uniform f16 r = (f16)tgammaf(extract((float)x, i));
    res = insert(res, i, r);
  }
  return res;
}
extern "C" unmasked uniform float lgammaf(uniform float x);
static inline f16 futrts_lgamma16(f16 x) {
  f16 res;
  // Only active lanes are computed and written back.
  foreach_active (i) {
    uniform f16 r = (f16)lgammaf(extract((float)x, i));
    res = insert(res, i, r);
  }
  return res;
}
static inline f16 futrts_cbrt16(f16 x) {
  f16 res = (f16)futrts_cbrt32((float)x);
  return res;
}
static inline f16 futrts_erf16(f16 x) {
  f16 res = (f16)futrts_erf32((float)x);
  return res;
}
static inline f16 futrts_erfc16(f16 x) {
  f16 res = (f16)futrts_erfc32((float)x);
  return res;
}
// ISPC with native float16: remainder, rounding and fused operations.

// Remainder with a truncated quotient.
static inline f16 fmod16(f16 x, f16 y) {
  return x - y * (float16)trunc((float) (x/y));
}
static inline f16 futrts_round16(f16 x) {
  return (float16)round((float)x);
}
static inline f16 futrts_floor16(f16 x) {
  return (float16)floor((float)x);
}
static inline f16 futrts_ceil16(f16 x) {
  return (float16)ceil((float)x);
}
// Delegates to the 32-bit implementation.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return (float16)futrts_nextafter32((float)x, (float) y);
}
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return v0 + (v1 - v0) * t;
}
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return a * b + c;
}
// Written as a plain multiply-add here, identical to futrts_mad16.
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return a * b + c;
}
#else // Assume CUDA. | |
// CUDA with native half: logarithms, roots and exp.  The h* functions
// take and return half; log1p and cbrt go through single precision.
static inline f16 futrts_log16(f16 x) {
  return hlog(x);
}
static inline f16 futrts_log2_16(f16 x) {
  return hlog2(x);
}
static inline f16 futrts_log10_16(f16 x) {
  return hlog10(x);
}
static inline f16 futrts_log1p_16(f16 x) {
  return (f16)log1pf((float)x);
}
static inline f16 futrts_sqrt16(f16 x) {
  return hsqrt(x);
}
static inline f16 futrts_cbrt16(f16 x) {
  return cbrtf(x);
}
static inline f16 futrts_exp16(f16 x) {
  return hexp(x);
}
// CUDA with native half: trigonometric and hyperbolic functions.  cos
// and sin use the half functions; everything else goes through the
// single-precision function via implicit half <-> float conversion.
static inline f16 futrts_cos16(f16 x) {
  return hcos(x);
}
static inline f16 futrts_sin16(f16 x) {
  return hsin(x);
}
static inline f16 futrts_tan16(f16 x) {
  return tanf(x);
}
static inline f16 futrts_acos16(f16 x) {
  return acosf(x);
}
static inline f16 futrts_asin16(f16 x) {
  return asinf(x);
}
static inline f16 futrts_atan16(f16 x) {
  return atanf(x);
}
static inline f16 futrts_cosh16(f16 x) {
  return coshf(x);
}
static inline f16 futrts_sinh16(f16 x) {
  return sinhf(x);
}
static inline f16 futrts_tanh16(f16 x) {
  return tanhf(x);
}
static inline f16 futrts_acosh16(f16 x) {
  return acoshf(x);
}
static inline f16 futrts_asinh16(f16 x) {
  return asinhf(x);
}
static inline f16 futrts_atanh16(f16 x) {
  return atanhf(x);
}
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return atan2f(x, y);
}
static inline f16 futrts_hypot16(f16 x, f16 y) {
  return hypotf(x, y);
}
// CUDA with native half: special functions, rounding and fused
// operations.
static inline f16 futrts_gamma16(f16 x) {
  return tgammaf(x);
}
static inline f16 futrts_lgamma16(f16 x) {
  return lgammaf(x);
}
static inline f16 futrts_erf16(f16 x) {
  return erff(x);
}
static inline f16 futrts_erfc16(f16 x) {
  return erfcf(x);
}
static inline f16 fmod16(f16 x, f16 y) {
  return fmodf(x, y);
}
// Rounding is done with rintf(), not roundf().
static inline f16 futrts_round16(f16 x) {
  return rintf(x);
}
static inline f16 futrts_floor16(f16 x) {
  return hfloor(x);
}
static inline f16 futrts_ceil16(f16 x) {
  return hceil(x);
}
// Operates on the raw 16-bit patterns of both operands.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y)));
}
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return v0 + (v1 - v0) * t;
}
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return a * b + c;
}
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return fmaf(a, b, c);
}
#endif | |
// The CUDA __half type cannot be put in unions for some reason, so we | |
// use bespoke conversion functions instead. | |
#ifdef __CUDA_ARCH__ | |
// CUDA: reinterpret between half and its 16-bit pattern using the
// dedicated intrinsics.  The casts make the unsigned <-> signed
// conversion at the int16_t interface explicit.
static inline int16_t futrts_to_bits16(f16 x) {
  return (int16_t)__half_as_ushort(x);
}
static inline f16 futrts_from_bits16(int16_t x) {
  return __ushort_as_half((unsigned short)x);
}
#elif ISPC | |
// ISPC: reinterpret between float16 and its 16-bit pattern through a
// pointer cast on the varying value.
static inline int16_t futrts_to_bits16(f16 x) {
  varying int16_t y = *((varying int16_t * uniform)&x);
  return y;
}
static inline f16 futrts_from_bits16(int16_t x) {
  varying f16 y = *((varying f16 * uniform)&x);
  return y;
}
#else | |
// Remaining native targets: reinterpret between f16 and its 16-bit
// pattern by writing one union member and reading the other.
static inline int16_t futrts_to_bits16(f16 x) {
  union {
    f16 value;
    int16_t bits;
  } pun;
  pun.value = x;
  return pun.bits;
}
static inline f16 futrts_from_bits16(int16_t x) {
  union {
    int16_t bits;
    f16 value;
  } pun;
  pun.bits = x;
  return pun.value;
}
#endif | |
#else // No native f16 - emulate. | |
// Emulated f16 (f16 is float): every operation forwards to its 32-bit
// counterpart.
static inline f16 fabs16(f16 x) {
  return fabs32(x);
}
static inline f16 fmax16(f16 x, f16 y) {
  return fmax32(x, y);
}
static inline f16 fmin16(f16 x, f16 y) {
  return fmin32(x, y);
}
static inline f16 fpow16(f16 x, f16 y) {
  return fpow32(x, y);
}
static inline bool futrts_isnan16(f16 x) {
  return futrts_isnan32(x);
}
static inline bool futrts_isinf16(f16 x) {
  return futrts_isinf32(x);
}
// Emulated f16: logarithms, roots and exp forward to the 32-bit
// versions.
static inline f16 futrts_log16(f16 x) {
  return futrts_log32(x);
}
static inline f16 futrts_log2_16(f16 x) {
  return futrts_log2_32(x);
}
static inline f16 futrts_log10_16(f16 x) {
  return futrts_log10_32(x);
}
static inline f16 futrts_log1p_16(f16 x) {
  return futrts_log1p_32(x);
}
static inline f16 futrts_sqrt16(f16 x) {
  return futrts_sqrt32(x);
}
static inline f16 futrts_cbrt16(f16 x) {
  return futrts_cbrt32(x);
}
static inline f16 futrts_exp16(f16 x) {
  return futrts_exp32(x);
}
// Emulated f16: trigonometric and hyperbolic functions forward to the
// 32-bit versions.
static inline f16 futrts_cos16(f16 x) {
  return futrts_cos32(x);
}
static inline f16 futrts_sin16(f16 x) {
  return futrts_sin32(x);
}
static inline f16 futrts_tan16(f16 x) {
  return futrts_tan32(x);
}
static inline f16 futrts_acos16(f16 x) {
  return futrts_acos32(x);
}
static inline f16 futrts_asin16(f16 x) {
  return futrts_asin32(x);
}
static inline f16 futrts_atan16(f16 x) {
  return futrts_atan32(x);
}
static inline f16 futrts_cosh16(f16 x) {
  return futrts_cosh32(x);
}
static inline f16 futrts_sinh16(f16 x) {
  return futrts_sinh32(x);
}
static inline f16 futrts_tanh16(f16 x) {
  return futrts_tanh32(x);
}
static inline f16 futrts_acosh16(f16 x) {
  return futrts_acosh32(x);
}
static inline f16 futrts_asinh16(f16 x) {
  return futrts_asinh32(x);
}
static inline f16 futrts_atanh16(f16 x) {
  return futrts_atanh32(x);
}
static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return futrts_atan2_32(x, y);
}
static inline f16 futrts_hypot16(f16 x, f16 y) {
  return futrts_hypot32(x, y);
}
// Emulated f16: special functions, rounding and fused operations
// forward to the 32-bit versions.
static inline f16 futrts_gamma16(f16 x) {
  return futrts_gamma32(x);
}
static inline f16 futrts_lgamma16(f16 x) {
  return futrts_lgamma32(x);
}
static inline f16 futrts_erf16(f16 x) {
  return futrts_erf32(x);
}
static inline f16 futrts_erfc16(f16 x) {
  return futrts_erfc32(x);
}
static inline f16 fmod16(f16 x, f16 y) {
  return fmod32(x, y);
}
static inline f16 futrts_round16(f16 x) {
  return futrts_round32(x);
}
static inline f16 futrts_floor16(f16 x) {
  return futrts_floor32(x);
}
static inline f16 futrts_ceil16(f16 x) {
  return futrts_ceil32(x);
}
// Unlike its neighbours this does not forward to the 32-bit version:
// both operands are converted to half bit patterns, stepped there, and
// the result is converted back to float.
static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return halfbits2float(halfbitsnextafter(float2halfbits(x), float2halfbits(y)));
}
static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return futrts_lerp32(v0, v1, t);
}
static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return futrts_mad32(a, b, c);
}
static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return futrts_fma32(a, b, c);
}
// Even when we are using an OpenCL that does not support cl_khr_fp16, | |
// it must still support vload_half for actually creating a | |
// half-precision number, which can then be efficiently converted to a | |
// float. Similarly for vstore_half. | |
#ifdef __OPENCL_VERSION__ | |
// Emulated f16 under OpenCL: use vstore_half/vload_half to convert
// between the float stand-in and the 16-bit storage pattern.
static inline int16_t futrts_to_bits16(f16 x) {
  int16_t y;
  // Violating strict aliasing here.
  vstore_half((float)x, 0, (half*)&y);
  return y;
}
static inline f16 futrts_from_bits16(int16_t x) {
  return (f16)vload_half(0, (half*)&x);
}
#else | |
// Emulated f16 outside OpenCL: convert between the float stand-in and
// the 16-bit pattern with the software conversion routines.
static inline int16_t futrts_to_bits16(f16 x) {
  return (int16_t)float2halfbits(x);
}
static inline f16 futrts_from_bits16(int16_t x) {
  return halfbits2float((uint16_t)x);
}
// Sign of x as -1, 0 or 1; NaN is passed through unchanged.
// NOTE(review): this is the only definition of fsignum16 in this
// section, so it exists only for emulated, non-OpenCL builds - confirm
// that is intended.
static inline f16 fsignum16(f16 x) {
  return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);
}
#endif | |
#endif | |
// Identity conversion.  Returns f16 rather than float: under emulation
// the two are the same type, and with a native half this avoids
// widening the result only for the caller to narrow it again.
static inline f16 fpconv_f16_f16(f16 x) {
  return x;
}
// Widen f16 to single precision via the implicit conversion.
static inline float fpconv_f16_f32(f16 x) {
  return x;
}
static inline f16 fpconv_f32_f16(float x) { | |
return (f16) x; | |
} | |
#ifdef FUTHARK_F64_ENABLED | |
// Widen f16 to double precision.
static inline double fpconv_f16_f64(f16 x) {
  return (double) x;
}
#if ISPC | |
// ISPC: narrow double to f16 in two steps, going through float.
static inline f16 fpconv_f64_f16(double x) {
  return (f16) ((float)x);
}
#else | |
static inline f16 fpconv_f64_f16(double x) { | |
return (f16) x; | |
} | |
#endif | |
#endif | |
// End of scalar_f16.h. | |
// Start of context_prototypes.h | |
// | |
// Prototypes for the functions in context.h, or that will be called | |
// from those functions, that need to be available very early. | |
struct futhark_context_config;
struct futhark_context;
static void set_error(struct futhark_context* ctx, char *error);
// These are called in context/config new/free functions and contain
// shared setup.  They are generated by the compiler itself.
static int init_constants(struct futhark_context*);
static int free_constants(struct futhark_context*);
static void setup_program(struct futhark_context* ctx);
static void teardown_program(struct futhark_context *ctx);
// Allocate host memory.  Must be freed with host_free().
static void host_alloc(struct futhark_context* ctx, size_t size, const char* tag, size_t* size_out, void** mem_out);
// Free memory allocated with host_alloc().
static void host_free(struct futhark_context* ctx, size_t size, const char* tag, void* mem);
// Functions that must be defined by the backend.
static void backend_context_config_setup(struct futhark_context_config* cfg);
static void backend_context_config_teardown(struct futhark_context_config* cfg);
static int backend_context_setup(struct futhark_context *ctx);
static void backend_context_teardown(struct futhark_context *ctx);
// End of context_prototypes.h
struct memblock_device { | |
int *references; | |
cl_mem mem; | |
int64_t size; | |
const char *desc; | |
}; | |
// Handle to a block of memory addressed through an ordinary pointer;
// same layout as memblock_device apart from the type of `mem`.
struct memblock {
  int *references;     // presumably a shared reference count - confirm against the ref/unref helpers
  unsigned char *mem;  // start of the block
  int64_t size;        // presumably the size in bytes - TODO confirm
  const char *desc;    // description string; looks intended for diagnostics
};
// Program constants.  This program has none, so the struct holds a
// single placeholder member (C does not allow an empty struct).
struct constants {
  int dummy;
};
// One pointer per tuning parameter; the member names match the entries
// of tuning_param_vars.
struct tuning_params {
  int64_t *addzisegmap_group_sizze_6879;
  int64_t *add_i64zisegmap_group_sizze_6899;
};
// Tuning-parameter tables.  The arrays are parallel: index i describes
// the same parameter in each of them.  The string arrays end with a
// NULL sentinel and the defaults array carries a matching extra 0.
static const int num_tuning_params = 2;
static const char *tuning_param_names[] = {"add.segmap_group_size_6879", "add_i64.segmap_group_size_6899", NULL};
static const char *tuning_param_vars[] = {"addzisegmap_group_sizze_6879", "add_i64zisegmap_group_sizze_6899", NULL};
static const char *tuning_param_classes[] = {"group_size", "group_size", NULL};
static int64_t tuning_param_defaults[] = {0, 0, 0};
static const int max_failure_args = 0;
static const int f64_required = 0;
static const char *opencl_program[] = {"\n// Clang-based OpenCL implementations need this for 'static' to work.\n#ifdef cl_clang_storage_class_specifiers\n#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable\n#endif\n#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n\n// Some OpenCL programs dislike empty progams, or programs with no kernels.\n// Declare a dummy kernel to ensure they remain our friends.\n__kernel void dummy_kernel(__global unsigned char *dummy, int n)\n{\n const int thread_gid = get_global_id(0);\n if (thread_gid >= n) return;\n}\n\n#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n\ntypedef char int8_t;\ntypedef short int16_t;\ntypedef int int32_t;\ntypedef long int64_t;\n\ntypedef uchar uint8_t;\ntypedef ushort uint16_t;\ntypedef uint uint32_t;\ntypedef ulong uint64_t;\n\n// NVIDIAs OpenCL does not create device-wide memory fences (see #734), so we\n// use inline assembly if we detect we are on an NVIDIA GPU.\n#ifdef cl_nv_pragma_unroll\nstatic inline void mem_fence_global() {\n asm(\"membar.gl;\");\n}\n#else\nstatic inline void mem_fence_global() {\n mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n}\n#endif\nstatic inline void mem_fence_local() {\n mem_fence(CLK_LOCAL_MEM_FENCE);\n}\n// Start of half.h.\n\n// Conversion functions are from http://half.sourceforge.net/, but\n// translated to C.\n//\n// Copyright (c) 2012-2021 Christian Rau\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to deal\n// in the Software without restriction, including without limitation the rights\n// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n// copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and 
this permission notice shall be included in\n// all copies or substantial portions of the Softwa", "re.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n// THE SOFTWARE.\n\n#ifndef __OPENCL_VERSION__\n#define __constant\n#endif\n\n__constant static const uint16_t base_table[512] = {\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,\n 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,\n 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C0", "0, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,\n 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,\n 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC0", | |
"0, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 };\n\n__constant static const unsigned char shift_table[512] = {\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,\n 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 
24, 2", "4, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,\n 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };\n\n__constant static const uint32_t mantissa_table[2048] = {\n 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,\n 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,\n 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,\n 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,\n 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,\n 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 
0x36A80000, 0x36AA00", "00, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,\n 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,\n 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,\n 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,\n 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,\n 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,\n 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,\n 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,\n 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,\n 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,\n 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 
0x37780000, 0x37790000, 0x377A00", | |
"00, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,\n 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,\n 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,\n 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,\n 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,\n 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,\n 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,\n 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,\n 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,\n 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,\n 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 
0x37CF80", "00,\n 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,\n 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,\n 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,\n 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,\n 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,\n 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,\n 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,\n 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,\n 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,\n 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,\n 0x38100000, 0x38104000, 0x38108000, 
0x3810C000, 0x3811", "0000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,\n 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,\n 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,\n 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,\n 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,\n 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,\n 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,\n 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,\n 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,\n 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,\n 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 
0x3839C000, 0x383A0000, 0x383A", | |
"4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,\n 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,\n 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,\n 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,\n 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,\n 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,\n 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,\n 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,\n 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,\n 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,\n 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 
0x3863", "8000, 0x3863C000,\n 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,\n 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,\n 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,\n 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,\n 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,\n 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,\n 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,\n 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,\n 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,\n 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,\n 0x38060000, 0x38062000, 
0x38064000, 0x38", "066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,\n 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,\n 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,\n 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,\n 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,\n 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,\n 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,\n 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,\n 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,\n 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,\n 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 
0x381AC000, 0x381AE000, 0x38", | |
"1B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,\n 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,\n 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,\n 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,\n 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,\n 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,\n 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,\n 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,\n 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,\n 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,\n 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 
0x38", "2FA000, 0x382FC000, 0x382FE000,\n 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,\n 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,\n 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,\n 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,\n 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,\n 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,\n 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,\n 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,\n 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,\n 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,\n 0x38440000, 
0x38442000, 0x", "38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,\n 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,\n 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,\n 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,\n 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,\n 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,\n 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,\n 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,\n 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,\n 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,\n 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 
0x3858A000, 0x3858C000, 0x", | |
"3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,\n 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,\n 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,\n 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,\n 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,\n 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,\n 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,\n 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,\n 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,\n 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,\n 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 
0x", "386D8000, 0x386DA000, 0x386DC000, 0x386DE000,\n 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,\n 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,\n 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,\n 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,\n 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,\n 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,\n 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,\n 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,\n 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };\n__constant static const uint32_t exponent_table[64] = {\n 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 
0x05800000, 0x06", "000000, 0x06800000, 0x07000000, 0x07800000,\n 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,\n 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,\n 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };\n__constant static const unsigned short offset_table[64] = {\n 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,\n 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };\n\nstatic uint16_t float2halfbits(float value) {\n union { float x; uint32_t y; } u;\n u.x = value;\n uint32_t bits = u.y;\n\n uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);;\n\n return hbits;\n}\n\nstatic float halfbits2float(uint16_t value) {\n uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];\n\n union { uint32_t x; float y; } u;\n u.x = bits;\n return u.y;\n}\n\nstatic uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {\n int fabs = from & 0x7FFF, tabs = to & 0x7FFF;\n if(fabs > 0x7C00 || tabs > 0x7C00) {\n return ((from&0x7FFF)>0x7C00) ? 
(from|0x200) : (to|0x200);\n }\n if(from == to || !(fabs|tabs)) {\n return to;\n }\n if(!fabs) {\n return (to&0x8000)+1;\n }\n unsigned int out =\n from +\n (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1)\n - 1;\n return out;", | |
"\n}\n\n// End of half.h.\n// Start of scalar.h.\n\n// Implementation of the primitive scalar operations. Very\n// repetitive. This code is inserted directly into both CUDA and\n// OpenCL programs, as well as the CPU code, so it has some #ifdefs to\n// work everywhere. Some operations are defined as macros because\n// this allows us to use them as constant expressions in things like\n// array sizes and static initialisers.\n\n// Some of the #ifdefs are because OpenCL uses type-generic functions\n// for some operations (e.g. sqrt), while C and CUDA sensibly use\n// distinct functions for different precisions (e.g. sqrtf() and\n// sqrt()). This is quite annoying. Due to C's unfortunate casting\n// rules, it is also really easy to accidentally implement\n// floating-point functions in the wrong precision, so be careful.\n\n// Double-precision definitions are only included if the preprocessor\n// macro FUTHARK_F64_ENABLED is set.\n\nstatic inline uint8_t add8(uint8_t x, uint8_t y) {\n return x + y;\n}\n\nstatic inline uint16_t add16(uint16_t x, uint16_t y) {\n return x + y;\n}\n\nstatic inline uint32_t add32(uint32_t x, uint32_t y) {\n return x + y;\n}\n\nstatic inline uint64_t add64(uint64_t x, uint64_t y) {\n return x + y;\n}\n\nstatic inline uint8_t sub8(uint8_t x, uint8_t y) {\n return x - y;\n}\n\nstatic inline uint16_t sub16(uint16_t x, uint16_t y) {\n return x - y;\n}\n\nstatic inline uint32_t sub32(uint32_t x, uint32_t y) {\n return x - y;\n}\n\nstatic inline uint64_t sub64(uint64_t x, uint64_t y) {\n return x - y;\n}\n\nstatic inline uint8_t mul8(uint8_t x, uint8_t y) {\n return x * y;\n}\n\nstatic inline uint16_t mul16(uint16_t x, uint16_t y) {\n return x * y;\n}\n\nstatic inline uint32_t mul32(uint32_t x, uint32_t y) {\n return x * y;\n}\n\nstatic inline uint64_t mul64(uint64_t x, uint64_t y) {\n return x * y;\n}\n\n#if ISPC\n\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y) {\n // This strange pattern is used to prevent the ISPC compiler from\n // causing 
SIGFPEs and bogus results on divisions where inactive lan", "es\n // have 0-valued divisors. It ensures that any inactive lane instead\n // has a divisor of 1. https://github.com/ispc/ispc/issues/2292\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n\n return x / ys;\n}\n\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline uint32_t udiv32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return x / ys;\n}\n\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return x / ys;\n}\n\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return (x + y - 1) / ys;\n}\n\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return (x + y - 1) / ys;\n}\n\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return (x + y - 1) / ys;\n}\n\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return (x + y - 1) / ys;\n}\n\nstatic inline uint8_t umod8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline uint16_t umod16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n\n return x % ys;\n}\n\nstatic inline uint32_t umod32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline uint64_t umod64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 
0 : x / ys;\n}\n\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ", "ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y) {\n uint8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y) {\n uint16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y) {\n uint32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y) {\n uint64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int8_t sdiv8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int8_t q = x / ys;\n int8_t r = x % ys;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0);\n}\n\nstatic inline int16_t sdiv16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int16_t q = x / ys;\n int16_t r = x % ys;\n\n return q - ((r != 0", | |
" && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int32_t sdiv32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n int32_t q = x / ys;\n int32_t r = x % ys;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int64_t sdiv64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int64_t q = x / ys;\n int64_t r = x % ys;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y) {\n return sdiv8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y) {\n return sdiv16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y) {\n return sdiv32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up64(int64_t x, int64_t y) {\n return sdiv64(x + y - 1, y);\n}\n\nstatic inline int8_t smod8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int8_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int16_t smod16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int16_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int32_t smod32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int32_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int64_t smod64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n int64_t r = x % ys;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : sdiv8(x, y);\n}\n\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : sdiv16(x, y);\n}\n\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y) {\n return y == 0 ? 
0 : sdiv32(x, y);\n}\n\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : sdi", "v64(x, y);\n}\n\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {\n return sdiv_safe8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {\n return sdiv_safe16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {\n return sdiv_safe32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {\n return sdiv_safe64(x + y - 1, y);\n}\n\nstatic inline int8_t smod_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : smod8(x, y);\n}\n\nstatic inline int16_t smod_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : smod16(x, y);\n}\n\nstatic inline int32_t smod_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : smod32(x, y);\n}\n\nstatic inline int64_t smod_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : smod64(x, y);\n}\n\nstatic inline int8_t squot8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int16_t squot16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int32_t squot32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int64_t squot64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x / ys;\n}\n\nstatic inline int8_t srem8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int16_t srem16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int32_t srem32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int64_t srem64(int64_t x, int64_t y) {\n // The lane-local divisor copy must be as wide as y; a narrower type\n // would truncate the divisor (e.g. 256 would become 0).\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return x % ys;\n}\n\nstatic inline int8_t 
squot_safe8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int16_t ", "squot_safe16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int32_t squot_safe32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int64_t squot_safe64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int8_t srem_safe8(int8_t x, int8_t y) {\n int8_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int16_t srem_safe16(int16_t x, int16_t y) {\n int16_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int32_t srem_safe32(int32_t x, int32_t y) {\n int32_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int64_t srem_safe64(int64_t x, int64_t y) {\n int64_t ys = 1;\n foreach_active(i){\n ys = y;\n }\n \n return y == 0 ? 
0 : x % ys;\n}\n\n#else\n\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y) {\n return x / y;\n}\n\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y) {\n return x / y;\n}\n\nstatic inline uint32_t udiv32(uint32_t x, uint32_t y) {\n return x / y;\n}\n\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y) {\n return x / y;\n}\n\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y) {\n return (x + y - 1) / y;\n}\n\nstatic inline uint8_t umod8(uint8_t x, uint8_t y) {\n return x % y;\n}\n\nstatic inline uint16_t umod16(uint16_t x, uint16_t y) {\n return x % y;\n}\n\nstatic inline uint32_t umod32(uint32_t x, uint32_t y) {\n return x % y;\n}\n\nstatic inline uint64_t umod64(uint64_t x, uint64_t y) {\n return x % y;\n}\n\nstatic inline uint8_t udiv_safe8(u", | |
"int8_t x, uint8_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {\n return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int8_t sdiv8(int8_t x, int8_t y) {\n int8_t q = x / y;\n int8_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int16_t sdiv16(int16_t x, int16_t y) {\n int16_t q = x / y;\n int16_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int32_t sdiv32(int32_t x, int32_t y) {\n int32_t q = x / y;\n int32_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int64_t sdiv64(int64_t x, int64_t y) {\n int64_t q = x / y;\n int64_t r = x % y;\n\n return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0);\n}\n\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y) {\n return sdiv8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y) {\n return sdiv16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y) {\n return sdiv32(x + y - 1, y);\n}\n\nstatic inline int64", "_t sdiv_up64(int64_t x, int64_t y) {\n return sdiv64(x + y - 1, y);\n}\n\nstatic inline int8_t smod8(int8_t x, int8_t y) {\n int8_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int16_t smod16(int16_t x, int16_t y) {\n int16_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int32_t smod32(int32_t x, int32_t y) {\n int32_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int64_t smod64(int64_t x, int64_t y) {\n int64_t r = x % y;\n\n return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : sdiv8(x, y);\n}\n\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : sdiv16(x, y);\n}\n\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : sdiv32(x, y);\n}\n\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : sdiv64(x, y);\n}\n\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {\n return sdiv_safe8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {\n return sdiv_safe16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {\n return sdiv_safe32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {\n return sdiv_safe64(x + y - 1, y);\n}\n\nstatic inline int8_t smod_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : smod8(x, y);\n}\n\nstatic inline int16_t smod_safe16(int16_t x, int16_t y) {\n return y == 0 ? 
0 : smod16(x, y);\n}\n\nstatic inline int32_t smod_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : smod32(x, y);\n}\n\nstatic inline int64_t smod_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : smod64(x, y);\n}\n\nstatic inline int8_t squot8(int8_t x, int8_t y) {\n return x / y;\n}\n\nstatic inline int16_t squot16(int16_t x, int16_t y) {\n return x / y;\n}\n\nstatic inline int32_t squot32(int32", "_t x, int32_t y) {\n return x / y;\n}\n\nstatic inline int64_t squot64(int64_t x, int64_t y) {\n return x / y;\n}\n\nstatic inline int8_t srem8(int8_t x, int8_t y) {\n return x % y;\n}\n\nstatic inline int16_t srem16(int16_t x, int16_t y) {\n return x % y;\n}\n\nstatic inline int32_t srem32(int32_t x, int32_t y) {\n return x % y;\n}\n\nstatic inline int64_t srem64(int64_t x, int64_t y) {\n return x % y;\n}\n\nstatic inline int8_t squot_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int16_t squot_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int32_t squot_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int64_t squot_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : x / y;\n}\n\nstatic inline int8_t srem_safe8(int8_t x, int8_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int16_t srem_safe16(int16_t x, int16_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int32_t srem_safe32(int32_t x, int32_t y) {\n return y == 0 ? 0 : x % y;\n}\n\nstatic inline int64_t srem_safe64(int64_t x, int64_t y) {\n return y == 0 ? 0 : x % y;\n}\n\n#endif\n\nstatic inline int8_t smin8(int8_t x, int8_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int16_t smin16(int16_t x, int16_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int32_t smin32(int32_t x, int32_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int64_t smin64(int64_t x, int64_t y) {\n return x < y ? x : y;\n}\n\nstatic inline uint8_t umin8(uint8_t x, uint8_t y) {\n return x < y ? 
x : y;\n}\n\nstatic inline uint16_t umin16(uint16_t x, uint16_t y) {\n return x < y ? x : y;\n}\n\nstatic inline uint32_t umin32(uint32_t x, uint32_t y) {\n return x < y ? x : y;\n}\n\nstatic inline uint64_t umin64(uint64_t x, uint64_t y) {\n return x < y ? x : y;\n}\n\nstatic inline int8_t smax8(int8_t x, int8_t y) {\n return x < y ? y : x;\n}\n\nstatic inline int16_t smax16(int16_t x, int16_t y) {\n return x < y ? y : x;\n}\n\nstatic inline int32_t smax32(int32_t x, int32_t y) {\n return x < y ? y : x;\n}\n\ns", | |
"tatic inline int64_t smax64(int64_t x, int64_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint8_t umax8(uint8_t x, uint8_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint16_t umax16(uint16_t x, uint16_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint32_t umax32(uint32_t x, uint32_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint64_t umax64(uint64_t x, uint64_t y) {\n return x < y ? y : x;\n}\n\nstatic inline uint8_t shl8(uint8_t x, uint8_t y) {\n return (uint8_t)(x << y);\n}\n\nstatic inline uint16_t shl16(uint16_t x, uint16_t y) {\n return (uint16_t)(x << y);\n}\n\nstatic inline uint32_t shl32(uint32_t x, uint32_t y) {\n return x << y;\n}\n\nstatic inline uint64_t shl64(uint64_t x, uint64_t y) {\n return x << y;\n}\n\nstatic inline uint8_t lshr8(uint8_t x, uint8_t y) {\n return x >> y;\n}\n\nstatic inline uint16_t lshr16(uint16_t x, uint16_t y) {\n return x >> y;\n}\n\nstatic inline uint32_t lshr32(uint32_t x, uint32_t y) {\n return x >> y;\n}\n\nstatic inline uint64_t lshr64(uint64_t x, uint64_t y) {\n return x >> y;\n}\n\nstatic inline int8_t ashr8(int8_t x, int8_t y) {\n return x >> y;\n}\n\nstatic inline int16_t ashr16(int16_t x, int16_t y) {\n return x >> y;\n}\n\nstatic inline int32_t ashr32(int32_t x, int32_t y) {\n return x >> y;\n}\n\nstatic inline int64_t ashr64(int64_t x, int64_t y) {\n return x >> y;\n}\n\nstatic inline uint8_t and8(uint8_t x, uint8_t y) {\n return x & y;\n}\n\nstatic inline uint16_t and16(uint16_t x, uint16_t y) {\n return x & y;\n}\n\nstatic inline uint32_t and32(uint32_t x, uint32_t y) {\n return x & y;\n}\n\nstatic inline uint64_t and64(uint64_t x, uint64_t y) {\n return x & y;\n}\n\nstatic inline uint8_t or8(uint8_t x, uint8_t y) {\n return x | y;\n}\n\nstatic inline uint16_t or16(uint16_t x, uint16_t y) {\n return x | y;\n}\n\nstatic inline uint32_t or32(uint32_t x, uint32_t y) {\n return x | y;\n}\n\nstatic inline uint64_t or64(uint64_t x, uint64_t y) {\n return x | y;\n}\n\nstatic inline uint8_t 
xor8(uint8_t x, uint8_t y) {\n return x ^ y;\n}\n\nstatic inline uint16_t xor16(uint16_", "t x, uint16_t y) {\n return x ^ y;\n}\n\nstatic inline uint32_t xor32(uint32_t x, uint32_t y) {\n return x ^ y;\n}\n\nstatic inline uint64_t xor64(uint64_t x, uint64_t y) {\n return x ^ y;\n}\n\nstatic inline bool ult8(uint8_t x, uint8_t y) {\n return x < y;\n}\n\nstatic inline bool ult16(uint16_t x, uint16_t y) {\n return x < y;\n}\n\nstatic inline bool ult32(uint32_t x, uint32_t y) {\n return x < y;\n}\n\nstatic inline bool ult64(uint64_t x, uint64_t y) {\n return x < y;\n}\n\nstatic inline bool ule8(uint8_t x, uint8_t y) {\n return x <= y;\n}\n\nstatic inline bool ule16(uint16_t x, uint16_t y) {\n return x <= y;\n}\n\nstatic inline bool ule32(uint32_t x, uint32_t y) {\n return x <= y;\n}\n\nstatic inline bool ule64(uint64_t x, uint64_t y) {\n return x <= y;\n}\n\nstatic inline bool slt8(int8_t x, int8_t y) {\n return x < y;\n}\n\nstatic inline bool slt16(int16_t x, int16_t y) {\n return x < y;\n}\n\nstatic inline bool slt32(int32_t x, int32_t y) {\n return x < y;\n}\n\nstatic inline bool slt64(int64_t x, int64_t y) {\n return x < y;\n}\n\nstatic inline bool sle8(int8_t x, int8_t y) {\n return x <= y;\n}\n\nstatic inline bool sle16(int16_t x, int16_t y) {\n return x <= y;\n}\n\nstatic inline bool sle32(int32_t x, int32_t y) {\n return x <= y;\n}\n\nstatic inline bool sle64(int64_t x, int64_t y) {\n return x <= y;\n}\n\nstatic inline uint8_t pow8(uint8_t x, uint8_t y) {\n uint8_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\n\nstatic inline uint16_t pow16(uint16_t x, uint16_t y) {\n uint16_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\n\nstatic inline uint32_t pow32(uint32_t x, uint32_t y) {\n uint32_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n x *= x;\n }\n return res;\n}\n\nstatic inline uint64_t 
pow64(uint64_t x, uint64_t y) {\n uint64_t res = 1, rem = y;\n\n while (rem != 0) {\n if (rem & 1)\n res *= x;\n rem >>= 1;\n ", "x *= x;\n }\n return res;\n}\n\nstatic inline bool itob_i8_bool(int8_t x) {\n return x != 0;\n}\n\nstatic inline bool itob_i16_bool(int16_t x) {\n return x != 0;\n}\n\nstatic inline bool itob_i32_bool(int32_t x) {\n return x != 0;\n}\n\nstatic inline bool itob_i64_bool(int64_t x) {\n return x != 0;\n}\n\nstatic inline int8_t btoi_bool_i8(bool x) {\n return x;\n}\n\nstatic inline int16_t btoi_bool_i16(bool x) {\n return x;\n}\n\nstatic inline int32_t btoi_bool_i32(bool x) {\n return x;\n}\n\nstatic inline int64_t btoi_bool_i64(bool x) {\n return x;\n}\n\n#define sext_i8_i8(x) ((int8_t) (int8_t) (x))\n#define sext_i8_i16(x) ((int16_t) (int8_t) (x))\n#define sext_i8_i32(x) ((int32_t) (int8_t) (x))\n#define sext_i8_i64(x) ((int64_t) (int8_t) (x))\n#define sext_i16_i8(x) ((int8_t) (int16_t) (x))\n#define sext_i16_i16(x) ((int16_t) (int16_t) (x))\n#define sext_i16_i32(x) ((int32_t) (int16_t) (x))\n#define sext_i16_i64(x) ((int64_t) (int16_t) (x))\n#define sext_i32_i8(x) ((int8_t) (int32_t) (x))\n#define sext_i32_i16(x) ((int16_t) (int32_t) (x))\n#define sext_i32_i32(x) ((int32_t) (int32_t) (x))\n#define sext_i32_i64(x) ((int64_t) (int32_t) (x))\n#define sext_i64_i8(x) ((int8_t) (int64_t) (x))\n#define sext_i64_i16(x) ((int16_t) (int64_t) (x))\n#define sext_i64_i32(x) ((int32_t) (int64_t) (x))\n#define sext_i64_i64(x) ((int64_t) (int64_t) (x))\n#define zext_i8_i8(x) ((int8_t) (uint8_t) (x))\n#define zext_i8_i16(x) ((int16_t) (uint8_t) (x))\n#define zext_i8_i32(x) ((int32_t) (uint8_t) (x))\n#define zext_i8_i64(x) ((int64_t) (uint8_t) (x))\n#define zext_i16_i8(x) ((int8_t) (uint16_t) (x))\n#define zext_i16_i16(x) ((int16_t) (uint16_t) (x))\n#define zext_i16_i32(x) ((int32_t) (uint16_t) (x))\n#define zext_i16_i64(x) ((int64_t) (uint16_t) (x))\n#define zext_i32_i8(x) ((int8_t) (uint32_t) (x))\n#define zext_i32_i16(x) ((int16_t) (uint32_t) 
(x))\n#define zext_i32_i32(x) ((int32_t) (uint32_t) (x))\n#define zext_i32_i64(x) ((int64_t) (uint32_t) (x))\n#define zext_i64_i8(x) ((int8_t) (uint64_t) (x))\n#define zext_i64_i16(x) ((int16_t) (", | |
"uint64_t) (x))\n#define zext_i64_i32(x) ((int32_t) (uint64_t) (x))\n#define zext_i64_i64(x) ((int64_t) (uint64_t) (x))\n\nstatic int8_t abs8(int8_t x) {\n return (int8_t)abs(x);\n}\n\nstatic int16_t abs16(int16_t x) {\n return (int16_t)abs(x);\n}\n\nstatic int32_t abs32(int32_t x) {\n return abs(x);\n}\n\nstatic int64_t abs64(int64_t x) {\n#if defined(__OPENCL_VERSION__) || defined(ISPC)\n return abs(x);\n#else\n return llabs(x);\n#endif\n}\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_popc8(int8_t x) {\n return popcount(x);\n}\n\nstatic int32_t futrts_popc16(int16_t x) {\n return popcount(x);\n}\n\nstatic int32_t futrts_popc32(int32_t x) {\n return popcount(x);\n}\n\nstatic int32_t futrts_popc64(int64_t x) {\n return popcount(x);\n}\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_popc8(int8_t x) {\n return __popc(zext_i8_i32(x));\n}\n\nstatic int32_t futrts_popc16(int16_t x) {\n return __popc(zext_i16_i32(x));\n}\n\nstatic int32_t futrts_popc32(int32_t x) {\n return __popc(x);\n}\n\nstatic int32_t futrts_popc64(int64_t x) {\n return __popcll(x);\n}\n\n#else // Not OpenCL or CUDA, but plain C.\n\nstatic int32_t futrts_popc8(uint8_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n\nstatic int32_t futrts_popc16(uint16_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n\nstatic int32_t futrts_popc32(uint32_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n\nstatic int32_t futrts_popc64(uint64_t x) {\n int c = 0;\n for (; x; ++c) { x &= x - 1; }\n return c;\n}\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); }\nstatic uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return 
mul_hi(a, b); }\nstatic uint16_t futrts_smul_hi16(int16_t a, int16_t b", ") { return mul_hi(a, b); }\nstatic uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); }\nstatic uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); }\n#elif defined(__CUDA_ARCH__)\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); }\nstatic uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }\nstatic uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }\nstatic uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }\nstatic uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }\n#elif ISPC\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {\n uint64_t ah = a >> 32;\n uint64_t al = a & 0xffffffff;\n uint64_t bh = b >> 32;\n uint64_t bl = b & 0xffffffff;\n\n uint64_t p1 = al * bl;\n uint64_t p2 = al * bh;\n uint64_t p3 = ah * bl;\n uint64_t p4 = ah * bh;\n\n uint64_t p1h = p1 >> 32;\n uint64_t p2h = p2 >> 32;\n uint64_t p3h = p3 >> 32;\n uint64_t p2l = p2 & 0xffffffff;\n uint64_t p3l = p3 & 0xffffffff;\n\n uint64_t l = p1h + p2l + p3l;\n uint64_t m = (p2 >> 32) + (p3 >> 32);\n uint64_t h = (l >> 32) + m + p4;\n\n return h;\n}\nstatic int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * 
((uint16_t)b) >> 8; }\nstatic int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a", ") * ((uint32_t)b) >> 16; }\nstatic int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic int64_t futrts_smul_hi64(int64_t a, int64_t b) {\n uint64_t ah = a >> 32;\n uint64_t al = a & 0xffffffff;\n uint64_t bh = b >> 32;\n uint64_t bl = b & 0xffffffff;\n\n uint64_t p1 = al * bl;\n int64_t p2 = al * bh;\n int64_t p3 = ah * bl;\n uint64_t p4 = ah * bh;\n\n uint64_t p1h = p1 >> 32;\n uint64_t p2h = p2 >> 32;\n uint64_t p3h = p3 >> 32;\n uint64_t p2l = p2 & 0xffffffff;\n uint64_t p3l = p3 & 0xffffffff;\n\n uint64_t l = p1h + p2l + p3l;\n uint64_t m = (p2 >> 32) + (p3 >> 32);\n uint64_t h = (l >> 32) + m + p4;\n\n return h;\n}\n\n#else // Not OpenCL, ISPC, or CUDA, but plain C.\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; }\nstatic int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }\nstatic int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }\nstatic int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; }\nstatic int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; }\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); }\nstatic uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); }\nstatic uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); 
}\nstatic uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); }\nstatic uint8_t futrts_sm", | |
"ad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); }\nstatic uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); }\nstatic uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); }\nstatic uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); }\n#else // Not OpenCL\n\nstatic uint8_t futrts_umad_hi8( uint8_t a, uint8_t b, uint8_t c) { return futrts_umul_hi8(a, b) + c; }\nstatic uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; }\nstatic uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; }\nstatic uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; }\nstatic uint8_t futrts_smad_hi8 ( int8_t a, int8_t b, int8_t c) { return futrts_smul_hi8(a, b) + c; }\nstatic uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; }\nstatic uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; }\nstatic uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; }\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_clzz8(int8_t x) {\n return clz(x);\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n return clz(x);\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return clz(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return clz(x);\n}\n\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_clzz8(int8_t x) {\n return __clz(zext_i8_i32(x)) - 24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n return __clz(zext_i16_i32(x)) - 16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return __clz(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return __clzll(x);\n}\n\n#elif ISPC\n\nstatic int32_t futrts_clzz8(int8_t x) {\n return count_leading_zeros((int32_t)(uint8_t)x)-24;\n}\n\nstatic int32_t 
futrts_clzz16(int16_t x) {\n return count_lead", "ing_zeros((int32_t)(uint16_t)x)-16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return count_leading_zeros(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return count_leading_zeros(x);\n}\n\n#else // Not OpenCL, ISPC or CUDA, but plain C.\n\nstatic int32_t futrts_clzz8(int8_t x) {\n return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n return x == 0 ? 32 : __builtin_clz((uint32_t)x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n return x == 0 ? 64 : __builtin_clzll((uint64_t)x);\n}\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_ctzz8(int8_t x) {\n int i = 0;\n for (; i < 8 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n int i = 0;\n for (; i < 16 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n int i = 0;\n for (; i < 32 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n int i = 0;\n for (; i < 64 && (x & 1) == 0; i++, x >>= 1)\n ;\n return i;\n}\n\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n int y = __ffs(x);\n return y == 0 ? 8 : y - 1;\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n int y = __ffs(x);\n return y == 0 ? 16 : y - 1;\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n int y = __ffs(x);\n return y == 0 ? 32 : y - 1;\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n int y = __ffsll(x);\n return y == 0 ? 64 : y - 1;\n}\n\n#elif ISPC\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n return x == 0 ? 8 : count_trailing_zeros((int32_t)x);\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n return x == 0 ? 
16 : count_trailing_zeros((int32_t)x);\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n return count_trailing_zeros(x);\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n return count_trailing_zeros(x);\n}\n\n#else // Not OpenCL or CUDA,", " but plain C.\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n return x == 0 ? 8 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n return x == 0 ? 16 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n return x == 0 ? 32 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n return x == 0 ? 64 : __builtin_ctzll((uint64_t)x);\n}\n#endif\n\nstatic inline float fdiv32(float x, float y) {\n return x / y;\n}\n\nstatic inline float fadd32(float x, float y) {\n return x + y;\n}\n\nstatic inline float fsub32(float x, float y) {\n return x - y;\n}\n\nstatic inline float fmul32(float x, float y) {\n return x * y;\n}\n\nstatic inline bool cmplt32(float x, float y) {\n return x < y;\n}\n\nstatic inline bool cmple32(float x, float y) {\n return x <= y;\n}\n\nstatic inline float sitofp_i8_f32(int8_t x) {\n return (float) x;\n}\n\nstatic inline float sitofp_i16_f32(int16_t x) {\n return (float) x;\n}\n\nstatic inline float sitofp_i32_f32(int32_t x) {\n return (float) x;\n}\n\nstatic inline float sitofp_i64_f32(int64_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i8_f32(uint8_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i16_f32(uint16_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i32_f32(uint32_t x) {\n return (float) x;\n}\n\nstatic inline float uitofp_i64_f32(uint64_t x) {\n return (float) x;\n}\n\n#ifdef __OPENCL_VERSION__\nstatic inline float fabs32(float x) {\n return fabs(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n return fmax(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n return fmin(x, y);\n}\n\nstatic inline float fpow32(float x, float y) {\n return pow(x, y);\n}\n\n#elif ISPC\n\nstatic 
inline float fabs32(float x) {\n return abs(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n return isnan(x) ? y : isnan(y) ? x : max(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n return isnan(x) ? y : isnan(y) ? x : min(x, y);\n}\n\nstatic inline float fpow32(float a, float b) {\n float ret;\n f", | |
"oreach_active (i) {\n uniform float r = __stdlib_powf(extract(a, i), extract(b, i));\n ret = insert(ret, i, r);\n }\n return ret;\n}\n\n#else // Not OpenCL, but CUDA or plain C.\n\nstatic inline float fabs32(float x) {\n return fabsf(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n return fmaxf(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n return fminf(x, y);\n}\n\nstatic inline float fpow32(float x, float y) {\n return powf(x, y);\n}\n#endif\n\nstatic inline bool futrts_isnan32(float x) {\n return isnan(x);\n}\n\n#if ISPC\n\nstatic inline bool futrts_isinf32(float x) {\n return !isnan(x) && isnan(x - x);\n}\n\nstatic inline bool futrts_isfinite32(float x) {\n return !isnan(x) && !futrts_isinf32(x);\n}\n\n#else\n\nstatic inline bool futrts_isinf32(float x) {\n return isinf(x);\n}\n\n#endif\n\nstatic inline int8_t fptosi_f32_i8(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int8_t) x;\n }\n}\n\nstatic inline int16_t fptosi_f32_i16(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int16_t) x;\n }\n}\n\nstatic inline int32_t fptosi_f32_i32(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int32_t) x;\n }\n}\n\nstatic inline int64_t fptosi_f32_i64(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (int64_t) x;\n };\n}\n\nstatic inline uint8_t fptoui_f32_i8(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (uint8_t) (int8_t) x;\n }\n}\n\nstatic inline uint16_t fptoui_f32_i16(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (uint16_t) (int16_t) x;\n }\n}\n\nstatic inline uint32_t fptoui_f32_i32(float x) {\n if (futrts_isnan32(x) || futrts_isinf32(x)) {\n return 0;\n } else {\n return (uint32_t) (int32_t) x;\n }\n}\n\nstatic inline uint64_t fptoui_f32_i64(float x) {\n if (futrts_isnan32(x) || 
futrts_isinf32(x)) {\n ret", "urn 0;\n } else {\n return (uint64_t) (int64_t) x;\n }\n}\n\nstatic inline bool ftob_f32_bool(float x) {\n return x != 0;\n}\n\nstatic inline float btof_bool_f32(bool x) {\n return x ? 1 : 0;\n}\n\n#ifdef __OPENCL_VERSION__\nstatic inline float futrts_log32(float x) {\n return log(x);\n}\n\nstatic inline float futrts_log2_32(float x) {\n return log2(x);\n}\n\nstatic inline float futrts_log10_32(float x) {\n return log10(x);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n return log1p(x);\n}\n\nstatic inline float futrts_sqrt32(float x) {\n return sqrt(x);\n}\n\nstatic inline float futrts_cbrt32(float x) {\n return cbrt(x);\n}\n\nstatic inline float futrts_exp32(float x) {\n return exp(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n return cos(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n return sin(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n return tan(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n return acos(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n return asin(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n return atan(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n return cosh(x);\n}\n\nstatic inline float futrts_sinh32(float x) {\n return sinh(x);\n}\n\nstatic inline float futrts_tanh32(float x) {\n return tanh(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n return acosh(x);\n}\n\nstatic inline float futrts_asinh32(float x) {\n return asinh(x);\n}\n\nstatic inline float futrts_atanh32(float x) {\n return atanh(x);\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n return atan2(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n return hypot(x, y);\n}\n\nstatic inline float futrts_gamma32(float x) {\n return tgamma(x);\n}\n\nstatic inline float futrts_lgamma32(float x) {\n return lgamma(x);\n}\n\nstatic inline float futrts_erf32(float x) {\n return erf(x);\n}\n\nstatic inline float futrts_erfc32(float x) {\n 
return erfc(x);\n}\n\nstatic inline float fmod32(float x, float y) {\n return fmod(x, y);\n}\n\nstatic inline float futrt", "s_round32(float x) {\n return rint(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n return floor(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n return ceil(x);\n}\n\nstatic inline float futrts_nextafter32(float x, float y) {\n return nextafter(x, y);\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n return mix(v0, v1, t);\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n return mad(a, b, c);\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n return fma(a, b, c);\n}\n\n#elif ISPC\n\nstatic inline float futrts_log32(float x) {\n return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x;\n}\n\nstatic inline float futrts_log2_32(float x) {\n return futrts_log32(x) / log(2.0f);\n}\n\nstatic inline float futrts_log10_32(float x) {\n return futrts_log32(x) / log(10.0f);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n if(x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f;\n float y = 1.0f + x;\n float z = y - 1.0f;\n return log(y) - (z-x)/y;\n}\n\nstatic inline float futrts_sqrt32(float x) {\n return sqrt(x);\n}\n\nextern \"C\" unmasked uniform float cbrtf(uniform float);\nstatic inline float futrts_cbrt32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = cbrtf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline float futrts_exp32(float x) {\n return exp(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n return cos(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n return sin(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n return tan(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n return acos(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n return asin(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n return atan(x);\n}\n\nstatic inline float 
futrts_cosh32(float x) {\n return (exp(x)+exp(-x)) / 2.0f;\n}\n\nstatic inline float futrts_sinh32(float x) {\n return (exp(x)-exp(-x)) / 2.0f;\n}\n\nstatic inline float futrts_tanh32(float x) {\n retur", | |
"n futrts_sinh32(x)/futrts_cosh32(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n float f = x+sqrt(x*x-1);\n if(futrts_isfinite32(f)) return log(f);\n return f;\n}\n\nstatic inline float futrts_asinh32(float x) {\n float f = x+sqrt(x*x+1);\n if(futrts_isfinite32(f)) return log(f);\n return f;\n\n}\n\nstatic inline float futrts_atanh32(float x) {\n float f = (1+x)/(1-x);\n if(futrts_isfinite32(f)) return log(f)/2.0f;\n return f;\n\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n return (x == 0.0f && y == 0.0f) ? 0.0f : atan2(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n if (futrts_isfinite32(x) && futrts_isfinite32(y)) {\n x = abs(x);\n y = abs(y);\n float a;\n float b;\n if (x >= y){\n a = x;\n b = y;\n } else {\n a = y;\n b = x;\n }\n if(b == 0){\n return a;\n }\n\n int e;\n float an;\n float bn;\n an = frexp (a, &e);\n bn = ldexp (b, - e);\n float cn;\n cn = sqrt (an * an + bn * bn);\n return ldexp (cn, e);\n } else {\n if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY;\n else return x + y;\n }\n\n}\n\nextern \"C\" unmasked uniform float tgammaf(uniform float x);\nstatic inline float futrts_gamma32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = tgammaf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float lgammaf(uniform float x);\nstatic inline float futrts_lgamma32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = lgammaf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float erff(uniform float x);\nstatic inline float futrts_erf32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = erff(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float erfcf(uniform float x);\nstatic inline float futrts_erfc32(float x) {\n float res;\n foreach_active (i) {\n uniform float r = erfcf(extr", "act(x, i));\n res = insert(res, 
i, r);\n }\n return res;\n}\n\nstatic inline float fmod32(float x, float y) {\n return x - y * trunc(x/y);\n}\n\nstatic inline float futrts_round32(float x) {\n return round(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n return floor(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n return ceil(x);\n}\n\nextern \"C\" unmasked uniform float nextafterf(uniform float x, uniform float y);\nstatic inline float futrts_nextafter32(float x, float y) {\n float res;\n foreach_active (i) {\n uniform float r = nextafterf(extract(x, i), extract(y, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n return a * b + c;\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n return a * b + c;\n}\n\n#else // Not OpenCL or ISPC, but CUDA or plain C.\n\nstatic inline float futrts_log32(float x) {\n return logf(x);\n}\n\nstatic inline float futrts_log2_32(float x) {\n return log2f(x);\n}\n\nstatic inline float futrts_log10_32(float x) {\n return log10f(x);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n return log1pf(x);\n}\n\nstatic inline float futrts_sqrt32(float x) {\n return sqrtf(x);\n}\n\nstatic inline float futrts_cbrt32(float x) {\n return cbrtf(x);\n}\n\nstatic inline float futrts_exp32(float x) {\n return expf(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n return cosf(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n return sinf(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n return tanf(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n return acosf(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n return asinf(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n return atanf(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n return coshf(x);\n}\n\nstatic inline float futrts_sinh32(float x) {\n return sinhf(x);\n}\n\nstatic 
inline float futrts_tanh32(float x) {\n r", "eturn tanhf(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n return acoshf(x);\n}\n\nstatic inline float futrts_asinh32(float x) {\n return asinhf(x);\n}\n\nstatic inline float futrts_atanh32(float x) {\n return atanhf(x);\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n return atan2f(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n return hypotf(x, y);\n}\n\nstatic inline float futrts_gamma32(float x) {\n return tgammaf(x);\n}\n\nstatic inline float futrts_lgamma32(float x) {\n return lgammaf(x);\n}\n\nstatic inline float futrts_erf32(float x) {\n return erff(x);\n}\n\nstatic inline float futrts_erfc32(float x) {\n return erfcf(x);\n}\n\nstatic inline float fmod32(float x, float y) {\n return fmodf(x, y);\n}\n\nstatic inline float futrts_round32(float x) {\n return rintf(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n return floorf(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n return ceilf(x);\n}\n\nstatic inline float futrts_nextafter32(float x, float y) {\n return nextafterf(x, y);\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n return a * b + c;\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n return fmaf(a, b, c);\n}\n#endif\n\n#if ISPC\nstatic inline int32_t futrts_to_bits32(float x) {\n return intbits(x);\n}\n\nstatic inline float futrts_from_bits32(int32_t x) {\n return floatbits(x);\n}\n#else\nstatic inline int32_t futrts_to_bits32(float x) {\n union {\n float f;\n int32_t t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline float futrts_from_bits32(int32_t x) {\n union {\n int32_t f;\n float t;\n } p;\n\n p.f = x;\n return p.t;\n}\n#endif\n\nstatic inline float fsignum32(float x) {\n return futrts_isnan32(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 
1 : 0);\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\n#if ISPC\nstatic inline bool futrts_isinf64(float x) {\n return !isnan(x) && isnan(x - x);\n}\n\nstatic inline bool futrts_isfinite64(fl", | |
"oat x) {\n return !isnan(x) && !futrts_isinf64(x);\n}\n\nstatic inline double fdiv64(double x, double y) {\n return x / y;\n}\n\nstatic inline double fadd64(double x, double y) {\n return x + y;\n}\n\nstatic inline double fsub64(double x, double y) {\n return x - y;\n}\n\nstatic inline double fmul64(double x, double y) {\n return x * y;\n}\n\nstatic inline bool cmplt64(double x, double y) {\n return x < y;\n}\n\nstatic inline bool cmple64(double x, double y) {\n return x <= y;\n}\n\nstatic inline double sitofp_i8_f64(int8_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i16_f64(int16_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i32_f64(int32_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i64_f64(int64_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i8_f64(uint8_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i16_f64(uint16_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i32_f64(uint32_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i64_f64(uint64_t x) {\n return (double) x;\n}\n\nstatic inline double fabs64(double x) {\n return abs(x);\n}\n\nstatic inline double fmax64(double x, double y) {\n return isnan(x) ? y : isnan(y) ? x : max(x, y);\n}\n\nstatic inline double fmin64(double x, double y) {\n return isnan(x) ? y : isnan(y) ? x : min(x, y);\n}\n\nstatic inline double fpow64(double a, double b) {\n float ret;\n foreach_active (i) {\n uniform float r = __stdlib_powf(extract(a, i), extract(b, i));\n ret = insert(ret, i, r);\n }\n return ret;\n}\n\nstatic inline double futrts_log64(double x) {\n return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? 
log(x) : x;\n}\n\nstatic inline double futrts_log2_64(double x) {\n return futrts_log64(x)/log(2.0d);\n}\n\nstatic inline double futrts_log10_64(double x) {\n return futrts_log64(x)/log(10.0d);\n}\n\nstatic inline double futrts_log1p_64(double x) {\n if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;\n double y = 1.0d + x;\n double z = y - 1.0d;\n return log", "(y) - (z-x)/y;\n}\n\nstatic inline double futrts_sqrt64(double x) {\n return sqrt(x);\n}\n\nextern \"C\" unmasked uniform double cbrt(uniform double);\nstatic inline double futrts_cbrt64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = cbrt(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_exp64(double x) {\n return exp(x);\n}\n\nstatic inline double futrts_cos64(double x) {\n return cos(x);\n}\n\nstatic inline double futrts_sin64(double x) {\n return sin(x);\n}\n\nstatic inline double futrts_tan64(double x) {\n return tan(x);\n}\n\nstatic inline double futrts_acos64(double x) {\n return acos(x);\n}\n\nstatic inline double futrts_asin64(double x) {\n return asin(x);\n}\n\nstatic inline double futrts_atan64(double x) {\n return atan(x);\n}\n\nstatic inline double futrts_cosh64(double x) {\n return (exp(x)+exp(-x)) / 2.0d;\n}\n\nstatic inline double futrts_sinh64(double x) {\n return (exp(x)-exp(-x)) / 2.0d;\n}\n\nstatic inline double futrts_tanh64(double x) {\n return futrts_sinh64(x)/futrts_cosh64(x);\n}\n\nstatic inline double futrts_acosh64(double x) {\n double f = x+sqrt(x*x-1.0d);\n if(futrts_isfinite64(f)) return log(f);\n return f;\n}\n\nstatic inline double futrts_asinh64(double x) {\n double f = x+sqrt(x*x+1.0d);\n if(futrts_isfinite64(f)) return log(f);\n return f;\n}\n\nstatic inline double futrts_atanh64(double x) {\n double f = (1.0d+x)/(1.0d-x);\n if(futrts_isfinite64(f)) return log(f)/2.0d;\n return f;\n\n}\n\nstatic inline double futrts_atan2_64(double x, double y) {\n return atan2(x, y);\n}\n\nextern \"C\" 
unmasked uniform double hypot(uniform double x, uniform double y);\nstatic inline double futrts_hypot64(double x, double y) {\n double res;\n foreach_active (i) {\n uniform double r = hypot(extract(x, i), extract(y, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double tgamma(uniform double x);\nstatic inline double futrts_gamma64(double x) {\n double res;\n foreach_active (i) {\n uniform double r", " = tgamma(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double lgamma(uniform double x);\nstatic inline double futrts_lgamma64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = lgamma(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double erf(uniform double x);\nstatic inline double futrts_erf64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = erf(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform double erfc(uniform double x);\nstatic inline double futrts_erfc64(double x) {\n double res;\n foreach_active (i) {\n uniform double r = erfc(extract(x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_fma64(double a, double b, double c) {\n return a * b + c;\n}\n\nstatic inline double futrts_round64(double x) {\n return round(x);\n}\n\nstatic inline double futrts_ceil64(double x) {\n return ceil(x);\n}\n\nextern \"C\" unmasked uniform double nextafter(uniform double x, uniform double y);\nstatic inline double futrts_nextafter64(double x, double y) {\n double res;\n foreach_active (i) {\n uniform double r = nextafter(extract(x, i), extract(y, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_floor64(double x) {\n return floor(x);\n}\n\nstatic inline bool futrts_isnan64(double x) {\n return isnan(x);\n}\n\nstatic inline int8_t fptosi_f64_i8(double x) {\n if (futrts_isnan64(x) || 
futrts_isinf64(x)) {\n return 0;\n } else {\n return (int8_t) x;\n }\n}\n\nstatic inline int16_t fptosi_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int16_t) x;\n }\n}\n\nstatic inline int32_t fptosi_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int32_t) x;\n }\n}\n\nstatic inline int64_t fptosi_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {", | |
"\n return 0;\n } else {\n return (int64_t) x;\n }\n}\n\nstatic inline uint8_t fptoui_f64_i8(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint8_t) (int8_t) x;\n }\n}\n\nstatic inline uint16_t fptoui_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint16_t) (int16_t) x;\n }\n}\n\nstatic inline uint32_t fptoui_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint32_t) (int32_t) x;\n }\n}\n\nstatic inline uint64_t fptoui_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint64_t) (int64_t) x;\n }\n}\n\nstatic inline bool ftob_f64_bool(double x) {\n return x != 0.0;\n}\n\nstatic inline double btof_bool_f64(bool x) {\n return x ? 1.0 : 0.0;\n}\n\nstatic inline int64_t futrts_to_bits64(double x) {\n int64_t res;\n foreach_active (i) {\n uniform double tmp = extract(x, i);\n uniform int64_t r = *((uniform int64_t* uniform)&tmp);\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double futrts_from_bits64(int64_t x) {\n double res;\n foreach_active (i) {\n uniform int64_t tmp = extract(x, i);\n uniform double r = *((uniform double* uniform)&tmp);\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline double fmod64(double x, double y) {\n return x - y * trunc(x/y);\n}\n\nstatic inline double fsignum64(double x) {\n return futrts_isnan64(x) ? x : (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 
1.0d : 0.0d);\n}\n\nstatic inline double futrts_lerp64(double v0, double v1, double t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline double futrts_mad64(double a, double b, double c) {\n return a * b + c;\n}\n\nstatic inline float fpconv_f32_f32(float x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f32_f64(float x) {\n return (double) x;\n}\n\nstatic inline float fpconv_f64_f32(double x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f64_f64(double x) {\n return (double", ") x;\n}\n\n#else\n\nstatic inline double fdiv64(double x, double y) {\n return x / y;\n}\n\nstatic inline double fadd64(double x, double y) {\n return x + y;\n}\n\nstatic inline double fsub64(double x, double y) {\n return x - y;\n}\n\nstatic inline double fmul64(double x, double y) {\n return x * y;\n}\n\nstatic inline bool cmplt64(double x, double y) {\n return x < y;\n}\n\nstatic inline bool cmple64(double x, double y) {\n return x <= y;\n}\n\nstatic inline double sitofp_i8_f64(int8_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i16_f64(int16_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i32_f64(int32_t x) {\n return (double) x;\n}\n\nstatic inline double sitofp_i64_f64(int64_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i8_f64(uint8_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i16_f64(uint16_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i32_f64(uint32_t x) {\n return (double) x;\n}\n\nstatic inline double uitofp_i64_f64(uint64_t x) {\n return (double) x;\n}\n\nstatic inline double fabs64(double x) {\n return fabs(x);\n}\n\nstatic inline double fmax64(double x, double y) {\n return fmax(x, y);\n}\n\nstatic inline double fmin64(double x, double y) {\n return fmin(x, y);\n}\n\nstatic inline double fpow64(double x, double y) {\n return pow(x, y);\n}\n\nstatic inline double futrts_log64(double x) {\n return log(x);\n}\n\nstatic inline double futrts_log2_64(double x) {\n return 
log2(x);\n}\n\nstatic inline double futrts_log10_64(double x) {\n return log10(x);\n}\n\nstatic inline double futrts_log1p_64(double x) {\n return log1p(x);\n}\n\nstatic inline double futrts_sqrt64(double x) {\n return sqrt(x);\n}\n\nstatic inline double futrts_cbrt64(double x) {\n return cbrt(x);\n}\n\nstatic inline double futrts_exp64(double x) {\n return exp(x);\n}\n\nstatic inline double futrts_cos64(double x) {\n return cos(x);\n}\n\nstatic inline double futrts_sin64(double x) {\n return sin(x);\n}\n\nstatic inline double futrts_tan64(double x) {\n return tan(x);\n}\n\nstatic inline double futrts_a", "cos64(double x) {\n return acos(x);\n}\n\nstatic inline double futrts_asin64(double x) {\n return asin(x);\n}\n\nstatic inline double futrts_atan64(double x) {\n return atan(x);\n}\n\nstatic inline double futrts_cosh64(double x) {\n return cosh(x);\n}\n\nstatic inline double futrts_sinh64(double x) {\n return sinh(x);\n}\n\nstatic inline double futrts_tanh64(double x) {\n return tanh(x);\n}\n\nstatic inline double futrts_acosh64(double x) {\n return acosh(x);\n}\n\nstatic inline double futrts_asinh64(double x) {\n return asinh(x);\n}\n\nstatic inline double futrts_atanh64(double x) {\n return atanh(x);\n}\n\nstatic inline double futrts_atan2_64(double x, double y) {\n return atan2(x, y);\n}\n\nstatic inline double futrts_hypot64(double x, double y) {\n return hypot(x, y);\n}\n\nstatic inline double futrts_gamma64(double x) {\n return tgamma(x);\n}\n\nstatic inline double futrts_lgamma64(double x) {\n return lgamma(x);\n}\n\nstatic inline double futrts_erf64(double x) {\n return erf(x);\n}\n\nstatic inline double futrts_erfc64(double x) {\n return erfc(x);\n}\n\nstatic inline double futrts_fma64(double a, double b, double c) {\n return fma(a, b, c);\n}\n\nstatic inline double futrts_round64(double x) {\n return rint(x);\n}\n\nstatic inline double futrts_ceil64(double x) {\n return ceil(x);\n}\n\nstatic inline double futrts_nextafter64(double x, double y) {\n return 
nextafter(x, y);\n}\n\nstatic inline double futrts_floor64(double x) {\n return floor(x);\n}\n\nstatic inline bool futrts_isnan64(double x) {\n return isnan(x);\n}\n\nstatic inline bool futrts_isinf64(double x) {\n return isinf(x);\n}\n\nstatic inline int8_t fptosi_f64_i8(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int8_t) x;\n }\n}\n\nstatic inline int16_t fptosi_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int16_t) x;\n }\n}\n\nstatic inline int32_t fptosi_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int32_t", | |
") x;\n }\n}\n\nstatic inline int64_t fptosi_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (int64_t) x;\n }\n}\n\nstatic inline uint8_t fptoui_f64_i8(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint8_t) (int8_t) x;\n }\n}\n\nstatic inline uint16_t fptoui_f64_i16(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint16_t) (int16_t) x;\n }\n}\n\nstatic inline uint32_t fptoui_f64_i32(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint32_t) (int32_t) x;\n }\n}\n\nstatic inline uint64_t fptoui_f64_i64(double x) {\n if (futrts_isnan64(x) || futrts_isinf64(x)) {\n return 0;\n } else {\n return (uint64_t) (int64_t) x;\n }\n}\n\nstatic inline bool ftob_f64_bool(double x) {\n return x != 0;\n}\n\nstatic inline double btof_bool_f64(bool x) {\n return x ? 1 : 0;\n}\n\nstatic inline int64_t futrts_to_bits64(double x) {\n union {\n double f;\n int64_t t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline double futrts_from_bits64(int64_t x) {\n union {\n int64_t f;\n double t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline double fmod64(double x, double y) {\n return fmod(x, y);\n}\n\nstatic inline double fsignum64(double x) {\n return futrts_isnan64(x) ? 
x : (x > 0) - (x < 0);\n}\n\nstatic inline double futrts_lerp64(double v0, double v1, double t) {\n#ifdef __OPENCL_VERSION__\n return mix(v0, v1, t);\n#else\n return v0 + (v1 - v0) * t;\n#endif\n}\n\nstatic inline double futrts_mad64(double a, double b, double c) {\n#ifdef __OPENCL_VERSION__\n return mad(a, b, c);\n#else\n return a * b + c;\n#endif\n}\n\nstatic inline float fpconv_f32_f32(float x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f32_f64(float x) {\n return (double) x;\n}\n\nstatic inline float fpconv_f64_f32(double x) {\n return (float) x;\n}\n\nstatic inline double fpconv_f64_f64(double x) {\n return (double) x;\n}\n\n#endif\n\n#endif\n\n// End", " of scalar.h.\n// Start of scalar_f16.h.\n\n// Half-precision is emulated if needed (e.g. in straight C) with the\n// native type used if possible. The emulation works by typedef'ing\n// 'float' to 'f16', and then implementing all operations on single\n// precision. To cut down on duplication, we use the same code for\n// those Futhark functions that require just operators or casts. 
The\n// in-memory representation for arrays will still be 16 bits even\n// under emulation, so the compiler will have to be careful when\n// generating reads or writes.\n\n#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC))\n#define EMULATE_F16\n#endif\n\n#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__)\n#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n#endif\n\n#ifdef EMULATE_F16\n\n// Note that the half-precision storage format is still 16 bits - the\n// compiler will have to be real careful!\ntypedef float f16;\n\n#elif ISPC\ntypedef float16 f16;\n\n#else\n\n#ifdef __CUDA_ARCH__\n#include <cuda_fp16.h>\n#endif\n\ntypedef half f16;\n\n#endif\n\n// Some of these functions convert to single precision because half\n// precision versions are not available.\n\nstatic inline f16 fadd16(f16 x, f16 y) {\n return x + y;\n}\n\nstatic inline f16 fsub16(f16 x, f16 y) {\n return x - y;\n}\n\nstatic inline f16 fmul16(f16 x, f16 y) {\n return x * y;\n}\n\nstatic inline bool cmplt16(f16 x, f16 y) {\n return x < y;\n}\n\nstatic inline bool cmple16(f16 x, f16 y) {\n return x <= y;\n}\n\nstatic inline f16 sitofp_i8_f16(int8_t x) {\n return (f16) x;\n}\n\nstatic inline f16 sitofp_i16_f16(int16_t x) {\n return (f16) x;\n}\n\nstatic inline f16 sitofp_i32_f16(int32_t x) {\n return (f16) x;\n}\n\nstatic inline f16 sitofp_i64_f16(int64_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i8_f16(uint8_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i16_f16(uint16_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i32_f16(uint32_t x) {\n return (f16) x;\n}\n\nstatic inline f16 uitofp_i64_f16(uint64_t x) {\n return (f1", "6) x;\n}\n\nstatic inline int8_t fptosi_f16_i8(f16 x) {\n return (int8_t) (float) x;\n}\n\nstatic inline int16_t fptosi_f16_i16(f16 x) {\n return (int16_t) x;\n}\n\nstatic inline int32_t fptosi_f16_i32(f16 x) {\n return (int32_t) x;\n}\n\nstatic inline int64_t fptosi_f16_i64(f16 x) {\n return (int64_t) 
x;\n}\n\nstatic inline uint8_t fptoui_f16_i8(f16 x) {\n return (uint8_t) (float) x;\n}\n\nstatic inline uint16_t fptoui_f16_i16(f16 x) {\n return (uint16_t) x;\n}\n\nstatic inline uint32_t fptoui_f16_i32(f16 x) {\n return (uint32_t) x;\n}\n\nstatic inline uint64_t fptoui_f16_i64(f16 x) {\n return (uint64_t) x;\n}\n\nstatic inline bool ftob_f16_bool(f16 x) {\n return x != (f16)0;\n}\n\nstatic inline f16 btof_bool_f16(bool x) {\n return x ? 1 : 0;\n}\n\n#ifndef EMULATE_F16\nstatic inline bool futrts_isnan16(f16 x) {\n return isnan((float)x);\n}\n\n#ifdef __OPENCL_VERSION__\n\nstatic inline f16 fabs16(f16 x) {\n return fabs(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return fmax(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return fmin(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return pow(x, y);\n}\n\n#elif ISPC\nstatic inline f16 fabs16(f16 x) {\n return abs(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return pow(x, y);\n}\n#else // Assuming CUDA.\n\nstatic inline f16 fabs16(f16 x) {\n return fabsf(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return fmaxf(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return fminf(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return powf(x, y);\n}\n#endif\n\n#if ISPC\nstatic inline bool futrts_isinf16(float x) {\n return !futrts_isnan16(x) && futrts_isnan16(x - x);\n}\nstatic inline bool futrts_isfinite16(float x) {\n return !futrts_isnan16(x) && !futrts_isinf16(x);\n}\n\n#else\n\nstatic inline bool futrts_isinf16(f16 x) {\n retu", | |
"rn isinf((float)x);\n}\n#endif\n\n#ifdef __OPENCL_VERSION__\nstatic inline f16 futrts_log16(f16 x) {\n return log(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n return log2(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return log10(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n return log1p(x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n return sqrt(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n return cbrt(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return exp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return cos(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return sin(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return tan(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return acos(x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n return asin(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return atan(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return cosh(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return sinh(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return tanh(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n return acosh(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n return asinh(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n return atanh(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n return atan2(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return hypot(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n return tgamma(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n return lgamma(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n return erf(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n return erfc(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return fmod(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return rint(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return floor(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return ceil(x);\n}\n\nstatic inline 
f16 futrts_nextafter16(f16 x, f16 y) {\n return nextafter(x, y);\n}\n\nstatic inline f16 futrts_", "lerp16(f16 v0, f16 v1, f16 t) {\n return mix(v0, v1, t);\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return mad(a, b, c);\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n return fma(a, b, c);\n}\n#elif ISPC\n\nstatic inline f16 futrts_log16(f16 x) {\n return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x;\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n return futrts_log16(x) / log(2.0f16);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return futrts_log16(x) / log(10.0f16);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16;\n f16 y = 1.0f16 + x;\n f16 z = y - 1.0f16;\n return log(y) - (z-x)/y;\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n return (float16)sqrt((float)x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return exp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return (float16)cos((float)x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return (float16)sin((float)x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return (float16)tan((float)x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return (float16)acos((float)x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n return (float16)asin((float)x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return (float16)atan((float)x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return (exp(x)+exp(-x)) / 2.0f16;\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return (exp(x)-exp(-x)) / 2.0f16;\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return futrts_sinh16(x)/futrts_cosh16(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n float16 f = x+(float16)sqrt((float)(x*x-1));\n if(futrts_isfinite16(f)) return log(f);\n return f;\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n float16 f = x+(float16)sqrt((float)(x*x+1));\n if(futrts_isfinite16(f)) return log(f);\n 
return f;\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n float16 f = (1+x)/(1-x);\n if(futrts_isfinite16(f)) return log(f)/2.0f16;\n return f;\n}\n\nstatic inline", " f16 futrts_atan2_16(f16 x, f16 y) {\n return (float16)atan2((float)x, (float)y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return (float16)futrts_hypot32((float)x, (float)y);\n}\n\nextern \"C\" unmasked uniform float tgammaf(uniform float x);\nstatic inline f16 futrts_gamma16(f16 x) {\n f16 res;\n foreach_active (i) {\n uniform f16 r = (f16)tgammaf(extract((float)x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nextern \"C\" unmasked uniform float lgammaf(uniform float x);\nstatic inline f16 futrts_lgamma16(f16 x) {\n f16 res;\n foreach_active (i) {\n uniform f16 r = (f16)lgammaf(extract((float)x, i));\n res = insert(res, i, r);\n }\n return res;\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n f16 res = (f16)futrts_cbrt32((float)x);\n return res;\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n f16 res = (f16)futrts_erf32((float)x);\n return res;\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n f16 res = (f16)futrts_erfc32((float)x);\n return res;\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return x - y * (float16)trunc((float) (x/y));\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return (float16)round((float)x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return (float16)floor((float)x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return (float16)ceil((float)x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n return (float16)futrts_nextafter32((float)x, (float) y);\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return a * b + c;\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n return a * b + c;\n}\n\n#else // Assume CUDA.\n\nstatic inline f16 futrts_log16(f16 x) {\n return hlog(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 
x) {\n return hlog2(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return hlog10(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n return (f16)log1pf((float)x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n ret", | |
"urn hsqrt(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n return cbrtf(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return hexp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return hcos(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return hsin(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return tanf(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return acosf(x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n return asinf(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return atanf(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return coshf(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return sinhf(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return tanhf(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n return acoshf(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n return asinhf(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n return atanhf(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n return atan2f(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return hypotf(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n return tgammaf(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n return lgammaf(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n return erff(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n return erfcf(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return fmodf(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return rintf(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return hfloor(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return hceil(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y)));\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n return v0 + (v1 - v0) * t;\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return a * b + c;\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 
b, f16 c) {\n return fmaf(a, b, c);\n}\n\n#endif\n\n// The CUDA __half type cannot be put ", "in unions for some reason, so we\n// use bespoke conversion functions instead.\n#ifdef __CUDA_ARCH__\nstatic inline int16_t futrts_to_bits16(f16 x) {\n return __half_as_ushort(x);\n}\nstatic inline f16 futrts_from_bits16(int16_t x) {\n return __ushort_as_half(x);\n}\n#elif ISPC\n\nstatic inline int16_t futrts_to_bits16(f16 x) {\n varying int16_t y = *((varying int16_t * uniform)&x);\n return y;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n varying f16 y = *((varying f16 * uniform)&x);\n return y;\n}\n#else\nstatic inline int16_t futrts_to_bits16(f16 x) {\n union {\n f16 f;\n int16_t t;\n } p;\n\n p.f = x;\n return p.t;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n union {\n int16_t f;\n f16 t;\n } p;\n\n p.f = x;\n return p.t;\n}\n#endif\n\n#else // No native f16 - emulate.\n\nstatic inline f16 fabs16(f16 x) {\n return fabs32(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n return fmax32(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n return fmin32(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n return fpow32(x, y);\n}\n\nstatic inline bool futrts_isnan16(f16 x) {\n return futrts_isnan32(x);\n}\n\nstatic inline bool futrts_isinf16(f16 x) {\n return futrts_isinf32(x);\n}\n\nstatic inline f16 futrts_log16(f16 x) {\n return futrts_log32(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n return futrts_log2_32(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n return futrts_log10_32(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n return futrts_log1p_32(x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n return futrts_sqrt32(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n return futrts_cbrt32(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n return futrts_exp32(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n return futrts_cos32(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n return 
futrts_sin32(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n return futrts_tan32(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n return futrts_acos32(x);\n}\n\nstatic inline f16 f", "utrts_asin16(f16 x) {\n return futrts_asin32(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n return futrts_atan32(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n return futrts_cosh32(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n return futrts_sinh32(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n return futrts_tanh32(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n return futrts_acosh32(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n return futrts_asinh32(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n return futrts_atanh32(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n return futrts_atan2_32(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n return futrts_hypot32(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n return futrts_gamma32(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n return futrts_lgamma32(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n return futrts_erf32(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n return futrts_erfc32(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n return fmod32(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n return futrts_round32(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n return futrts_floor32(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n return futrts_ceil32(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n return halfbits2float(halfbitsnextafter(float2halfbits(x), float2halfbits(y)));\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n return futrts_lerp32(v0, v1, t);\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n return futrts_mad32(a, b, c);\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n return futrts_fma32(a, b, c);\n}\n\n// Even when we are using an OpenCL 
that does not support cl_khr_fp16,\n// it must still support vload_half for actually creating a\n// half-precision number, which can then be efficiently converted to a\n// float. Similarly for vstore_half.\n#ifdef __OPENCL_VERSION__\n\nstatic inline int16_t futrt", | |
"s_to_bits16(f16 x) {\n int16_t y;\n // Violating strict aliasing here.\n vstore_half((float)x, 0, (half*)&y);\n return y;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n return (f16)vload_half(0, (half*)&x);\n}\n\n#else\n\nstatic inline int16_t futrts_to_bits16(f16 x) {\n return (int16_t)float2halfbits(x);\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n return halfbits2float((uint16_t)x);\n}\n\nstatic inline f16 fsignum16(f16 x) {\n return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);\n}\n\n#endif\n\n#endif\n\nstatic inline float fpconv_f16_f16(f16 x) {\n return x;\n}\n\nstatic inline float fpconv_f16_f32(f16 x) {\n return x;\n}\n\nstatic inline f16 fpconv_f32_f16(float x) {\n return (f16) x;\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\nstatic inline double fpconv_f16_f64(f16 x) {\n return (double) x;\n}\n\n#if ISPC\nstatic inline f16 fpconv_f64_f16(double x) {\n return (f16) ((float)x);\n}\n#else\nstatic inline f16 fpconv_f64_f16(double x) {\n return (f16) x;\n}\n#endif\n#endif\n\n\n// End of scalar_f16.h.\n// Start of atomics.h\n\ninline int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xchg_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,\n int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((int32_t*)p, cmp, val);\n#else\n return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_local(volatile __local int32_t *p,\n int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((int32_t*)p, cmp, val);\n#else\n return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n re", 
"turn atomicAdd((int32_t*)p, x);\n#else\n return atomic_add(p, x);\n#endif\n}\n\ninline int32_t atomic_add_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((int32_t*)p, x);\n#else\n return atomic_add(p, x);\n#endif\n}\n\ninline float atomic_fadd_f32_global(volatile __global float *p, float x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((float*)p, x);\n#else\n union { int32_t i; float f; } old;\n union { int32_t i; float f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline float atomic_fadd_f32_local(volatile __local float *p, float x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((float*)p, x);\n#else\n union { int32_t i; float f; } old;\n union { int32_t i; float f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i32_local((volatile __local int32_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smax_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint32_t*)p, 
x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\n", "inline uint32_t atomic_umax_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint32_t*)p, x);\n#else\n return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint32_t*)p, x);\n#else\n return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int32_t*)p, x);\n#else\n return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int32_t*)p, x);\n#else\n return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int32_t*)p, x);\n#else\n return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int32_t*)p, x);\n#else\n return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int32_t*)p, x);\n#else\n return atomic_xor(p, x);\n#endif\n}\n\n// Start of 64 bit atomics\n\ninline int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((uint64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xchg_i64_local(volatile __local int64_t 
*p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicExch((uint64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninli", | |
"ne int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,\n int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((uint64_t*)p, cmp, val);\n#else\n return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_local(volatile __local int64_t *p,\n int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n return atomicCAS((uint64_t*)p, cmp, val);\n#else\n return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((uint64_t*)p, x);\n#else\n return atom_add(p, x);\n#endif\n}\n\ninline int64_t atomic_add_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAdd((uint64_t*)p, x);\n#else\n return atom_add(p, x);\n#endif\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\ninline double atomic_fadd_f64_global(volatile __global double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n return atomicAdd((double*)p, x);\n#else\n union { int64_t i; double f; } old;\n union { int64_t i; double f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\ninline double atomic_fadd_f64_local(volatile __local double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n return atomicAdd((double*)p, x);\n#else\n union { int64_t i; double f; } old;\n union { int64_t i; double f; } assumed;\n old.f = *p;\n do {\n assumed.f = old.f;\n old.f = old.f + x;\n old.i = atomic_cmpxchg_i64_local((volatile __local int64_t*)p, assumed.i, old.i);\n } while (assumed.i != old.i);\n return old.f;\n#endif\n}\n\n#endif\n\ninline int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline ", "int64_t 
atomic_smax_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((int64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((int64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMax((uint64_t*)p, x);\n#else\n return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicMin((uint64_t*)p, x);\n#else\n return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int64_t*)p, x);\n#else\n return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicAnd((int64_t*)p, x);\n#else\n return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int64_t*)p, x);\n#else\n return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicOr((int64_t*)p, x);\n#else\n return atom_or(p, 
x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_global(volatile __global ", "int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n return atomicXor((int64_t*)p, x);\n#else\n return atom_xor(p, x);\n#endif\n}\n\n// End of atomics.h\n\n\n\n__attribute__((reqd_work_group_size(addzisegmap_group_sizze_6879, 1, 1)))\n__kernel void addzisegmap_6892(__global int *global_failure, int64_t n_6776, __global unsigned char *xs_mem_6916, __global unsigned char *ys_mem_6917, __global unsigned char *mem_6920)\n{\n #define segmap_group_sizze_6888 (addzisegmap_group_sizze_6879)\n \n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n if (*global_failure >= 0)\n return;\n \n int32_t local_tid_6925;\n int64_t group_sizze_6928;\n int32_t wave_sizze_6927;\n int32_t group_tid_6926;\n \n local_tid_6925 = get_local_id(0);\n group_sizze_6928 = get_local_size(0);\n wave_sizze_6927 = LOCKSTEP_WIDTH;\n group_tid_6926 = get_group_id(0);\n \n int32_t global_tid_6924 = group_tid_6926 * group_sizze_6928 + local_tid_6925;\n int32_t phys_tid_6892 = global_tid_6924;\n int64_t global_tid_6929 = sext_i32_i64(group_tid_6926) * segmap_group_sizze_6888 + sext_i32_i64(local_tid_6925);\n int64_t slice_6930 = n_6776;\n int64_t gtid_6891 = global_tid_6929;\n int64_t remnant_6931 = global_tid_6929 - gtid_6891;\n \n if (slt64(gtid_6891, n_6776)) {\n int8_t x_6893 = ((__global int8_t *) xs_mem_6916)[gtid_6891];\n int8_t x_6894 = ((__global int8_t *) ys_mem_6917)[gtid_6891];\n int8_t defunc_0_f_res_6895 = add8(x_6893, x_6894);\n \n ((__global int8_t *) mem_6920)[gtid_6891] = defunc_0_f_res_6895;\n }\n \n error_0:\n return;\n #undef segmap_group_sizze_6888\n}\n__attribute__((reqd_work_group_size(add_i64zisegmap_group_sizze_6899, 1, 1)))\n__kernel void add_i64zisegmap_6912(__global int *global_failure, int64_t 
n_", "6836, __global unsigned char *xs_mem_6916, __global unsigned char *ys_mem_6917, __global unsigned char *mem_6921)\n{\n #define segmap_group_sizze_6908 (add_i64zisegmap_group_sizze_6899)\n \n const int block_dim0 = 0;\n const int block_dim1 = 1;\n const int block_dim2 = 2;\n \n if (*global_failure >= 0)\n return;\n \n int32_t local_tid_6925;\n int64_t group_sizze_6928;\n int32_t wave_sizze_6927;\n int32_t group_tid_6926;\n \n local_tid_6925 = get_local_id(0);\n group_sizze_6928 = get_local_size(0);\n wave_sizze_6927 = LOCKSTEP_WIDTH;\n group_tid_6926 = get_group_id(0);\n \n int32_t global_tid_6924 = group_tid_6926 * group_sizze_6928 + local_tid_6925;\n int32_t phys_tid_6912 = global_tid_6924;\n int64_t global_tid_6929 = sext_i32_i64(group_tid_6926) * segmap_group_sizze_6908 + sext_i32_i64(local_tid_6925);\n int64_t slice_6930 = n_6836;\n int64_t gtid_6911 = global_tid_6929;\n int64_t remnant_6931 = global_tid_6929 - gtid_6911;\n \n if (slt64(gtid_6911, n_6836)) {\n int64_t x_6913 = ((__global int64_t *) xs_mem_6916)[gtid_6911];\n int64_t x_6914 = ((__global int64_t *) ys_mem_6917)[gtid_6911];\n int64_t defunc_0_f_res_6915 = add64(x_6913, x_6914);\n \n ((__global int64_t *) mem_6921)[gtid_6911] = defunc_0_f_res_6915;\n }\n \n error_0:\n return;\n #undef segmap_group_sizze_6908\n}\n", NULL}; | |
// Start of backends/opencl.h
// Forward declarations.
struct opencl_device_option; | |
// Invoked by setup_opencl() after the platform and device have been
// found, but before the program is loaded. Its intended use is to
// tune constants based on the selected platform and device.
static void post_opencl_setup(struct futhark_context*, struct opencl_device_option*); | |
// Not defined in this section; only declared here so later code can
// refer to it.
static void set_tuning_params(struct futhark_context* ctx); | |
// Used by futhark_context_sync() to turn a failure index and the
// failure arguments read back from the device into an error message.
static char* get_failure_msg(int failure_idx, int64_t args[]); | |
// Check the result of an OpenCL call; panic (via opencl_succeed_fatal)
// if it is not CL_SUCCESS.
#define OPENCL_SUCCEED_FATAL(e) opencl_succeed_fatal(e, #e, __FILE__, __LINE__)
// Check the result of an OpenCL call; evaluates to NULL on success and
// to a heap-allocated error message on failure.
#define OPENCL_SUCCEED_NONFATAL(e) opencl_succeed_nonfatal(e, #e, __FILE__, __LINE__)
// Check the result of an OpenCL call and, if it failed, return 'bad'
// from the enclosing function.  Requires a variable 'ctx' in scope.
// Take care not to override an existing error: if ctx->error is
// already set, the new message is discarded, but the enclosing
// function still returns 'bad' so that the failed call is never
// silently treated as a success.
#define OPENCL_SUCCEED_OR_RETURN(e) {             \
    char *serror = OPENCL_SUCCEED_NONFATAL(e);    \
    if (serror) {                                 \
      if (!ctx->error) {                          \
        ctx->error = serror;                      \
      } else {                                    \
        free(serror);                             \
      }                                           \
      return bad;                                 \
    }                                             \
  }
// OPENCL_SUCCEED_OR_RETURN returns the value of the variable 'bad' in
// scope.  By default, it will be this one.  Create a local variable
// of some other type if needed.  This is a bit of a hack, but it
// saves effort in the code generator.
static const int bad = 1;
// Map an OpenCL status code to a human-readable description.  The
// returned string is a literal and must not be freed.  Codes that are
// not in the table yield "Unknown".
static const char* opencl_error_string(cl_int err) {
  static const struct {
    cl_int code;
    const char *text;
  } known_errors[] = {
    { CL_SUCCESS,                         "Success!" },
    { CL_DEVICE_NOT_FOUND,                "Device not found." },
    { CL_DEVICE_NOT_AVAILABLE,            "Device not available" },
    { CL_COMPILER_NOT_AVAILABLE,          "Compiler not available" },
    { CL_MEM_OBJECT_ALLOCATION_FAILURE,   "Memory object allocation failure" },
    { CL_OUT_OF_RESOURCES,                "Out of resources" },
    { CL_OUT_OF_HOST_MEMORY,              "Out of host memory" },
    { CL_PROFILING_INFO_NOT_AVAILABLE,    "Profiling information not available" },
    { CL_MEM_COPY_OVERLAP,                "Memory copy overlap" },
    { CL_IMAGE_FORMAT_MISMATCH,           "Image format mismatch" },
    { CL_IMAGE_FORMAT_NOT_SUPPORTED,      "Image format not supported" },
    { CL_BUILD_PROGRAM_FAILURE,           "Program build failure" },
    { CL_MAP_FAILURE,                     "Map failure" },
    { CL_INVALID_VALUE,                   "Invalid value" },
    { CL_INVALID_DEVICE_TYPE,             "Invalid device type" },
    { CL_INVALID_PLATFORM,                "Invalid platform" },
    { CL_INVALID_DEVICE,                  "Invalid device" },
    { CL_INVALID_CONTEXT,                 "Invalid context" },
    { CL_INVALID_QUEUE_PROPERTIES,        "Invalid queue properties" },
    { CL_INVALID_COMMAND_QUEUE,           "Invalid command queue" },
    { CL_INVALID_HOST_PTR,                "Invalid host pointer" },
    { CL_INVALID_MEM_OBJECT,              "Invalid memory object" },
    { CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "Invalid image format descriptor" },
    { CL_INVALID_IMAGE_SIZE,              "Invalid image size" },
    { CL_INVALID_SAMPLER,                 "Invalid sampler" },
    { CL_INVALID_BINARY,                  "Invalid binary" },
    { CL_INVALID_BUILD_OPTIONS,           "Invalid build options" },
    { CL_INVALID_PROGRAM,                 "Invalid program" },
    { CL_INVALID_PROGRAM_EXECUTABLE,      "Invalid program executable" },
    { CL_INVALID_KERNEL_NAME,             "Invalid kernel name" },
    { CL_INVALID_KERNEL_DEFINITION,       "Invalid kernel definition" },
    { CL_INVALID_KERNEL,                  "Invalid kernel" },
    { CL_INVALID_ARG_INDEX,               "Invalid argument index" },
    { CL_INVALID_ARG_VALUE,               "Invalid argument value" },
    { CL_INVALID_ARG_SIZE,                "Invalid argument size" },
    { CL_INVALID_KERNEL_ARGS,             "Invalid kernel arguments" },
    { CL_INVALID_WORK_DIMENSION,          "Invalid work dimension" },
    { CL_INVALID_WORK_GROUP_SIZE,         "Invalid work group size" },
    { CL_INVALID_WORK_ITEM_SIZE,          "Invalid work item size" },
    { CL_INVALID_GLOBAL_OFFSET,           "Invalid global offset" },
    { CL_INVALID_EVENT_WAIT_LIST,         "Invalid event wait list" },
    { CL_INVALID_EVENT,                   "Invalid event" },
    { CL_INVALID_OPERATION,               "Invalid operation" },
    { CL_INVALID_GL_OBJECT,               "Invalid OpenGL object" },
    { CL_INVALID_BUFFER_SIZE,             "Invalid buffer size" },
    { CL_INVALID_MIP_LEVEL,               "Invalid mip-map level" },
  };
  const size_t num_known = sizeof(known_errors) / sizeof(known_errors[0]);
  for (size_t k = 0; k < num_known; k++) {
    if (known_errors[k].code == err) {
      return known_errors[k].text;
    }
  }
  return "Unknown";
}
// Backing function of OPENCL_SUCCEED_FATAL.  Does nothing when 'ret'
// is CL_SUCCESS; otherwise calls futhark_panic() with a message naming
// the source location, the text of the failed call, the numeric code
// and its description.
static void opencl_succeed_fatal(cl_int ret,
                                 const char *call,
                                 const char *file,
                                 int line) {
  if (ret == CL_SUCCESS) {
    return;
  }
  futhark_panic(-1, "%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n",
                file, line, call, ret, opencl_error_string(ret));
}
// Backing function of OPENCL_SUCCEED_NONFATAL.  Returns NULL when
// 'ret' is CL_SUCCESS; otherwise returns the message produced by
// msgprintf() describing the source location, the failed call, the
// numeric code and its description.
static char* opencl_succeed_nonfatal(cl_int ret,
                                     const char *call,
                                     const char *file,
                                     int line) {
  return ret == CL_SUCCESS
    ? NULL
    : msgprintf("%s:%d: OpenCL call\n %s\nfailed with error code %d (%s)\n",
                file, line, call, ret, opencl_error_string(ret));
}
// User-adjustable configuration for the OpenCL backend.  The setter
// functions in this section only store values here; the values are
// consumed when the OpenCL context is set up.
struct futhark_context_config { | |
int in_use; | |
// When nonzero, opencl_alloc() writes diagnostic messages to its log.
int debugging; | |
int profiling; | |
// When nonzero, the OpenCL setup code prints informational messages to stderr.
int logging; | |
const char *cache_fname; | |
// The four tuning_param* arrays below are indexed in parallel, from 0
// to num_tuning_params-1.
int num_tuning_params; | |
// Current value of each tuning parameter; 0 means "use the default
// for its class" when the OpenCL setup code processes the sizes.
int64_t *tuning_params; | |
// Name matched by futhark_context_config_set_tuning_param().
const char** tuning_param_names; | |
// Identifier emitted as "-D<var>=<value>" by mk_compile_opts().
const char** tuning_param_vars; | |
// Size class; the OpenCL setup code compares its prefix against
// "group_size", "num_groups", "tile_size", "reg_tile_size" and "threshold".
const char** tuning_param_classes; | |
// Uniform fields above.
// Device index: the N of a "#N" prefix given to
// futhark_context_config_set_device(), or the interactive choice.
int preferred_device_num; | |
// Platform and device strings set by the user; "" when unset.  The
// strings are stored by pointer, not copied.
const char *preferred_platform; | |
const char *preferred_device; | |
// Set to 1 whenever the user picks a platform or device.
int ignore_blacklist; | |
// File paths stored by the corresponding setters; NULL when unset.
const char* dump_program_to; | |
const char* load_program_from; | |
const char* dump_binary_to; | |
const char* load_binary_from; | |
// Default sizes.  backend_context_config_setup() initialises them to
// 0, a dummy value meaning the concrete default is chosen during
// initialisation.
size_t default_group_size; | |
size_t default_num_groups; | |
size_t default_tile_size; | |
size_t default_reg_tile_size; | |
size_t default_threshold; | |
// Set to 1 by the explicit set_default_group_size/set_default_tile_size
// setters; the OpenCL setup code only prints its "Device limits
// default ..." note when the corresponding flag is set.
int default_group_size_changed; | |
int default_tile_size_changed; | |
// build_opts holds num_build_opts option strings followed by a NULL
// terminator.  The array is owned by the config; the strings are not.
int num_build_opts; | |
const char **build_opts; | |
// Command queue supplied via futhark_context_config_set_command_queue();
// only meaningful when queue_set is nonzero.
cl_command_queue queue; | |
int queue_set; | |
}; | |
static void backend_context_config_setup(struct futhark_context_config* cfg) { | |
cfg->num_build_opts = 0; | |
cfg->build_opts = (const char**) malloc(sizeof(const char*)); | |
cfg->build_opts[0] = NULL; | |
cfg->preferred_device_num = 0; | |
cfg->preferred_platform = ""; | |
cfg->preferred_device = ""; | |
cfg->ignore_blacklist = 0; | |
cfg->dump_program_to = NULL; | |
cfg->load_program_from = NULL; | |
cfg->dump_binary_to = NULL; | |
cfg->load_binary_from = NULL; | |
// The following are dummy sizes that mean the concrete defaults | |
// will be set during initialisation via hardware-inspection-based | |
// heuristics. | |
cfg->default_group_size = 0; | |
cfg->default_num_groups = 0; | |
cfg->default_tile_size = 0; | |
cfg->default_reg_tile_size = 0; | |
cfg->default_threshold = 0; | |
cfg->default_group_size_changed = 0; | |
cfg->default_tile_size_changed = 0; | |
cfg->queue_set = 0; | |
} | |
static void backend_context_config_teardown(struct futhark_context_config* cfg) { | |
free(cfg->build_opts); | |
} | |
void futhark_context_config_add_build_option(struct futhark_context_config* cfg, const char *opt) { | |
cfg->build_opts[cfg->num_build_opts] = opt; | |
cfg->num_build_opts++; | |
cfg->build_opts = (const char**) realloc(cfg->build_opts, (cfg->num_build_opts+1) * sizeof(const char*)); | |
cfg->build_opts[cfg->num_build_opts] = NULL; | |
} | |
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char* s) { | |
int x = 0; | |
if (*s == '#') { | |
s++; | |
while (isdigit(*s)) { | |
x = x * 10 + (*s++)-'0'; | |
} | |
// Skip trailing spaces. | |
while (isspace(*s)) { | |
s++; | |
} | |
} | |
cfg->preferred_device = s; | |
cfg->preferred_device_num = x; | |
cfg->ignore_blacklist = 1; | |
} | |
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s) { | |
cfg->preferred_platform = s; | |
cfg->ignore_blacklist = 1; | |
} | |
// Store a user-supplied command queue in the configuration and mark
// it as set.
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue q) {
  cfg->queue_set = 1;
  cfg->queue = q;
}
// Describes one OpenCL device together with the platform it belongs to.
struct opencl_device_option { | |
cl_platform_id platform; | |
cl_device_id device; | |
// Value of CL_DEVICE_TYPE for the device.
cl_device_type device_type; | |
// Heap-allocated strings (CL_PLATFORM_NAME and CL_DEVICE_NAME).  In
// the arrays built by opencl_all_device_options() each entry has its
// own copies, which the users of that array free individually.
char *platform_name; | |
char *device_name; | |
}; | |
static char* opencl_platform_info(cl_platform_id platform, | |
cl_platform_info param) { | |
size_t req_bytes; | |
char *info; | |
OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, 0, NULL, &req_bytes)); | |
info = (char*) malloc(req_bytes); | |
OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, req_bytes, info, NULL)); | |
return info; | |
} | |
static char* opencl_device_info(cl_device_id device, | |
cl_device_info param) { | |
size_t req_bytes; | |
char *info; | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, 0, NULL, &req_bytes)); | |
info = (char*) malloc(req_bytes); | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, req_bytes, info, NULL)); | |
return info; | |
} | |
static int is_blacklisted(const char *platform_name, const char *device_name, | |
const struct futhark_context_config *cfg) { | |
if (strcmp(cfg->preferred_platform, "") != 0 || | |
strcmp(cfg->preferred_device, "") != 0) { | |
return 0; | |
} else if (strstr(platform_name, "Apple") != NULL && | |
strstr(device_name, "Intel(R) Core(TM)") != NULL) { | |
return 1; | |
} else { | |
return 0; | |
} | |
} | |
static void opencl_all_device_options(struct opencl_device_option **devices_out, | |
size_t *num_devices_out) { | |
size_t num_devices = 0, num_devices_added = 0; | |
cl_platform_id *all_platforms; | |
cl_uint *platform_num_devices; | |
cl_uint num_platforms; | |
// Find the number of platforms. | |
OPENCL_SUCCEED_FATAL(clGetPlatformIDs(0, NULL, &num_platforms)); | |
// Make room for them. | |
all_platforms = calloc(num_platforms, sizeof(cl_platform_id)); | |
platform_num_devices = calloc(num_platforms, sizeof(cl_uint)); | |
// Fetch all the platforms. | |
OPENCL_SUCCEED_FATAL(clGetPlatformIDs(num_platforms, all_platforms, NULL)); | |
// Count the number of devices for each platform, as well as the | |
// total number of devices. | |
for (cl_uint i = 0; i < num_platforms; i++) { | |
if (clGetDeviceIDs(all_platforms[i], CL_DEVICE_TYPE_ALL, | |
0, NULL, &platform_num_devices[i]) == CL_SUCCESS) { | |
num_devices += platform_num_devices[i]; | |
} else { | |
platform_num_devices[i] = 0; | |
} | |
} | |
// Make room for all the device options. | |
struct opencl_device_option *devices = | |
calloc(num_devices, sizeof(struct opencl_device_option)); | |
// Loop through the platforms, getting information about their devices. | |
for (cl_uint i = 0; i < num_platforms; i++) { | |
cl_platform_id platform = all_platforms[i]; | |
cl_uint num_platform_devices = platform_num_devices[i]; | |
if (num_platform_devices == 0) { | |
continue; | |
} | |
char *platform_name = opencl_platform_info(platform, CL_PLATFORM_NAME); | |
cl_device_id *platform_devices = | |
calloc(num_platform_devices, sizeof(cl_device_id)); | |
// Fetch all the devices. | |
OPENCL_SUCCEED_FATAL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, | |
num_platform_devices, platform_devices, NULL)); | |
// Loop through the devices, adding them to the devices array. | |
for (cl_uint i = 0; i < num_platform_devices; i++) { | |
char *device_name = opencl_device_info(platform_devices[i], CL_DEVICE_NAME); | |
devices[num_devices_added].platform = platform; | |
devices[num_devices_added].device = platform_devices[i]; | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(platform_devices[i], CL_DEVICE_TYPE, | |
sizeof(cl_device_type), | |
&devices[num_devices_added].device_type, | |
NULL)); | |
// We don't want the structs to share memory, so copy the platform name. | |
// Each device name is already unique. | |
devices[num_devices_added].platform_name = strclone(platform_name); | |
devices[num_devices_added].device_name = device_name; | |
num_devices_added++; | |
} | |
free(platform_devices); | |
free(platform_name); | |
} | |
free(all_platforms); | |
free(platform_num_devices); | |
*devices_out = devices; | |
*num_devices_out = num_devices; | |
} | |
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg) { | |
struct opencl_device_option *devices; | |
size_t num_devices; | |
opencl_all_device_options(&devices, &num_devices); | |
printf("Choose OpenCL device:\n"); | |
const char *cur_platform = ""; | |
for (size_t i = 0; i < num_devices; i++) { | |
struct opencl_device_option device = devices[i]; | |
if (strcmp(cur_platform, device.platform_name) != 0) { | |
printf("Platform: %s\n", device.platform_name); | |
cur_platform = device.platform_name; | |
} | |
printf("[%d] %s\n", (int)i, device.device_name); | |
} | |
int selection; | |
printf("Choice: "); | |
if (scanf("%d", &selection) == 1) { | |
cfg->preferred_platform = ""; | |
cfg->preferred_device = ""; | |
cfg->preferred_device_num = selection; | |
cfg->ignore_blacklist = 1; | |
} | |
// Free all the platform and device names. | |
for (size_t j = 0; j < num_devices; j++) { | |
free(devices[j].platform_name); | |
free(devices[j].device_name); | |
} | |
free(devices); | |
} | |
void futhark_context_config_list_devices(struct futhark_context_config *cfg) { | |
(void)cfg; | |
struct opencl_device_option *devices; | |
size_t num_devices; | |
opencl_all_device_options(&devices, &num_devices); | |
const char *cur_platform = ""; | |
for (size_t i = 0; i < num_devices; i++) { | |
struct opencl_device_option device = devices[i]; | |
if (strcmp(cur_platform, device.platform_name) != 0) { | |
printf("Platform: %s\n", device.platform_name); | |
cur_platform = device.platform_name; | |
} | |
printf("[%d]: %s\n", (int)i, device.device_name); | |
} | |
// Free all the platform and device names. | |
for (size_t j = 0; j < num_devices; j++) { | |
free(devices[j].platform_name); | |
free(devices[j].device_name); | |
} | |
free(devices); | |
} | |
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *path) { | |
cfg->dump_program_to = path; | |
} | |
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *path) { | |
cfg->load_program_from = path; | |
} | |
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *path) { | |
cfg->dump_binary_to = path; | |
} | |
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *path) { | |
cfg->load_binary_from = path; | |
} | |
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size) { | |
cfg->default_group_size = size; | |
cfg->default_group_size_changed = 1; | |
} | |
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int num) { | |
cfg->default_num_groups = num; | |
} | |
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size) { | |
cfg->default_tile_size = size; | |
cfg->default_tile_size_changed = 1; | |
} | |
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size) { | |
cfg->default_reg_tile_size = size; | |
} | |
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size) { | |
cfg->default_threshold = size; | |
} | |
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, | |
const char *param_name, | |
size_t new_value) { | |
for (int i = 0; i < cfg->num_tuning_params; i++) { | |
if (strcmp(param_name, cfg->tuning_param_names[i]) == 0) { | |
cfg->tuning_params[i] = new_value; | |
return 0; | |
} | |
} | |
if (strcmp(param_name, "default_group_size") == 0) { | |
cfg->default_group_size = new_value; | |
return 0; | |
} | |
if (strcmp(param_name, "default_num_groups") == 0) { | |
cfg->default_num_groups = new_value; | |
return 0; | |
} | |
if (strcmp(param_name, "default_threshold") == 0) { | |
cfg->default_threshold = new_value; | |
return 0; | |
} | |
if (strcmp(param_name, "default_tile_size") == 0) { | |
cfg->default_tile_size = new_value; | |
return 0; | |
} | |
if (strcmp(param_name, "default_reg_tile_size") == 0) { | |
cfg->default_reg_tile_size = new_value; | |
return 0; | |
} | |
return 1; | |
} | |
// A record of something that happened: an OpenCL event together with
// the counters that should be updated once its timing is known.
struct profiling_record { | |
// Heap-allocated by opencl_get_event(); released and freed by
// opencl_tally_profiling_records().
cl_event *event; | |
// Incremented by one when the record is tallied.
int *runs; | |
// Increased by the event's duration in microseconds when the record
// is tallied.
int64_t *runtime; | |
}; | |
// Run-time state of a Futhark context for the OpenCL backend.
struct futhark_context { | |
// The configuration this context was created from.  Setup reads it
// and also clamps some of its values.
struct futhark_context_config* cfg; | |
int detail_memory; | |
int debugging; | |
int profiling; | |
int profiling_paused; | |
int logging; | |
lock_t lock; | |
// Pending error message, or NULL.  Set by OPENCL_SUCCEED_OR_RETURN
// and by futhark_context_sync().
char *error; | |
lock_t error_lock; | |
FILE *log; | |
struct constants *constants; | |
struct free_list free_list; | |
int64_t peak_mem_usage_default; | |
int64_t cur_mem_usage_default; | |
struct program* program; | |
// Common fields above.
// Device buffer holding a cl_int failure index.  A negative value
// means no failure; futhark_context_sync() resets it to -1 after
// reporting a failure.
cl_mem global_failure; | |
// Device buffer holding the int64_t arguments of the failure, read
// back by futhark_context_sync().
cl_mem global_failure_args; | |
struct tuning_params tuning_params; | |
// True if a potentially failing kernel has been enqueued.
cl_int failure_is_an_option; | |
int total_runs; | |
long int total_runtime; | |
int64_t peak_mem_usage_device; | |
int64_t cur_mem_usage_device; | |
// OpenCL handles.  Setup stores the command queue it is given and
// derives the context and device from it.
cl_device_id device; | |
cl_context ctx; | |
cl_command_queue queue; | |
cl_program clprogram; | |
// Free list of device buffers, used by opencl_alloc(), opencl_free()
// and opencl_free_all().
struct free_list cl_free_list; | |
// Limits determined during setup.  For max_num_groups and
// max_threshold, setup stores 0, commented there as "No limit".
size_t max_group_size; | |
size_t max_num_groups; | |
size_t max_tile_size; | |
size_t max_threshold; | |
size_t max_local_memory; | |
// Passed to the OpenCL compiler as -DLOCKSTEP_WIDTH; setup forces it
// to at least 1.
size_t lockstep_width; | |
// Array of pending profiling records: 'used' entries are filled out
// of 'capacity' allocated.
struct profiling_record *profiling_records; | |
int profiling_records_capacity; | |
int profiling_records_used; | |
}; | |
// Build 'program' for 'device' with the given compiler options and
// return the resulting build status.  A compilation failure
// (CL_BUILD_PROGRAM_FAILURE) is not fatal: the build log is printed
// to stderr and the status is returned to the caller.  Any other
// OpenCL failure is fatal.
static cl_build_status build_opencl_program(cl_program program, cl_device_id device, const char* options) {
  cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);
  switch (clBuildProgram_error) {
  case CL_SUCCESS:
  case CL_BUILD_PROGRAM_FAILURE:
    // Avoid termination due to CL_BUILD_PROGRAM_FAILURE; it is
    // reported through the build status below.
    break;
  default:
    OPENCL_SUCCEED_FATAL(clBuildProgram_error);
    break;
  }

  cl_build_status build_status;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program,
                                             device,
                                             CL_PROGRAM_BUILD_STATUS,
                                             sizeof(cl_build_status),
                                             &build_status,
                                             NULL));
  if (build_status == CL_SUCCESS) {
    return build_status;
  }

  // The build did not succeed: fetch and print the build log.
  size_t ret_val_size;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));
  char *build_log = (char*) malloc(ret_val_size+1);
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));
  // The spec technically does not say whether the build log is
  // zero-terminated, so terminate it ourselves.
  build_log[ret_val_size] = '\0';
  fprintf(stderr, "Build log:\n%s\n", build_log);
  free(build_log);

  return build_status;
}
static char* mk_compile_opts(struct futhark_context *ctx, | |
const char *extra_build_opts[], | |
struct opencl_device_option device_option) { | |
int compile_opts_size = 1024; | |
for (int i = 0; i < ctx->cfg->num_tuning_params; i++) { | |
compile_opts_size += strlen(ctx->cfg->tuning_param_names[i]) + 20; | |
} | |
for (int i = 0; extra_build_opts[i] != NULL; i++) { | |
compile_opts_size += strlen(extra_build_opts[i] + 1); | |
} | |
char *compile_opts = (char*) malloc(compile_opts_size); | |
int w = snprintf(compile_opts, compile_opts_size, | |
"-DLOCKSTEP_WIDTH=%d ", | |
(int)ctx->lockstep_width); | |
w += snprintf(compile_opts+w, compile_opts_size-w, | |
"-D%s=%d ", | |
"max_group_size", | |
(int)ctx->max_group_size); | |
for (int i = 0; i < ctx->cfg->num_tuning_params; i++) { | |
w += snprintf(compile_opts+w, compile_opts_size-w, | |
"-D%s=%d ", | |
ctx->cfg->tuning_param_vars[i], | |
(int)ctx->cfg->tuning_params[i]); | |
} | |
for (int i = 0; extra_build_opts[i] != NULL; i++) { | |
w += snprintf(compile_opts+w, compile_opts_size-w, | |
"%s ", extra_build_opts[i]); | |
} | |
// Oclgrind claims to support cl_khr_fp16, but this is not actually | |
// the case. | |
if (strcmp(device_option.platform_name, "Oclgrind") == 0) { | |
w += snprintf(compile_opts+w, compile_opts_size-w, "-DEMULATE_F16 "); | |
} | |
return compile_opts; | |
} | |
// Count up the runtime all the profiling_records that occured during execution. | |
// Also clears the buffer of profiling_records. | |
static cl_int opencl_tally_profiling_records(struct futhark_context *ctx) { | |
cl_int err; | |
for (int i = 0; i < ctx->profiling_records_used; i++) { | |
struct profiling_record record = ctx->profiling_records[i]; | |
cl_ulong start_t, end_t; | |
if ((err = clGetEventProfilingInfo(*record.event, | |
CL_PROFILING_COMMAND_START, | |
sizeof(start_t), | |
&start_t, | |
NULL)) != CL_SUCCESS) { | |
return err; | |
} | |
if ((err = clGetEventProfilingInfo(*record.event, | |
CL_PROFILING_COMMAND_END, | |
sizeof(end_t), | |
&end_t, | |
NULL)) != CL_SUCCESS) { | |
return err; | |
} | |
// OpenCL provides nanosecond resolution, but we want | |
// microseconds. | |
*record.runs += 1; | |
*record.runtime += (end_t - start_t)/1000; | |
if ((err = clReleaseEvent(*record.event)) != CL_SUCCESS) { | |
return err; | |
} | |
free(record.event); | |
} | |
ctx->profiling_records_used = 0; | |
return CL_SUCCESS; | |
} | |
// If profiling, produce an event associated with a profiling record. | |
static cl_event* opencl_get_event(struct futhark_context *ctx, int *runs, int64_t *runtime) { | |
if (ctx->profiling_records_used == ctx->profiling_records_capacity) { | |
ctx->profiling_records_capacity *= 2; | |
ctx->profiling_records = | |
realloc(ctx->profiling_records, | |
ctx->profiling_records_capacity * | |
sizeof(struct profiling_record)); | |
} | |
cl_event *event = malloc(sizeof(cl_event)); | |
ctx->profiling_records[ctx->profiling_records_used].event = event; | |
ctx->profiling_records[ctx->profiling_records_used].runs = runs; | |
ctx->profiling_records[ctx->profiling_records_used].runtime = runtime; | |
ctx->profiling_records_used++; | |
return event; | |
} | |
// Allocate memory from driver.  The problem is that OpenCL may perform
// lazy allocation, so we cannot know whether an allocation succeeded
// until the first time we try to use it.  Hence we immediately
// perform a write to see if the allocation succeeded.  This is slow,
// but the assumption is that this operation will be rare (most things
// will go through the free list).
//
// Returns CL_SUCCESS and stores the buffer in *mem_out, or returns an
// OpenCL error code.  If the probing write fails, the buffer is
// released again and *mem_out is set to NULL, so that callers that
// retry do not leak one buffer per attempt.  'size' must be at least
// sizeof(int), because that is how much the probe writes.
static int opencl_alloc_actual(struct futhark_context *ctx, size_t size, cl_mem *mem_out) {
  int error;
  *mem_out = clCreateBuffer(ctx->ctx, CL_MEM_READ_WRITE, size, NULL, &error);
  if (error != CL_SUCCESS) {
    return error;
  }
  int x = 2;
  error = clEnqueueWriteBuffer(ctx->queue, *mem_out,
                               CL_TRUE,
                               0, sizeof(x), &x,
                               0, NULL, NULL);
  // No need to wait for completion here. clWaitForEvents() cannot
  // return mem object allocation failures. This implies that the
  // buffer is faulted onto the device on enqueue. (Observation by
  // Andreas Kloeckner.)
  if (error != CL_SUCCESS) {
    // The buffer is unusable; report the write error, not the result
    // of the release.
    clReleaseMemObject(*mem_out);
    *mem_out = NULL;
  }
  return error;
}
static int opencl_alloc(struct futhark_context *ctx, FILE *log, | |
size_t min_size, const char *tag, | |
cl_mem *mem_out, size_t *size_out) { | |
(void)tag; | |
if (min_size < sizeof(int)) { | |
min_size = sizeof(int); | |
} | |
cl_mem* memptr; | |
if (free_list_find(&ctx->cl_free_list, min_size, tag, size_out, (fl_mem*)&memptr) == 0) { | |
// Successfully found a free block. Is it big enough? | |
if (*size_out >= min_size) { | |
if (ctx->cfg->debugging) { | |
fprintf(log, "No need to allocate: Found a block in the free list.\n"); | |
} | |
*mem_out = *memptr; | |
free(memptr); | |
return CL_SUCCESS; | |
} else { | |
if (ctx->cfg->debugging) { | |
fprintf(log, "Found a free block, but it was too small.\n"); | |
} | |
int error = clReleaseMemObject(*memptr); | |
free(*memptr); | |
if (error != CL_SUCCESS) { | |
return error; | |
} | |
} | |
} | |
*size_out = min_size; | |
// We have to allocate a new block from the driver. If the | |
// allocation does not succeed, then we might be in an out-of-memory | |
// situation. We now start freeing things from the free list until | |
// we think we have freed enough that the allocation will succeed. | |
// Since we don't know how far the allocation is from fitting, we | |
// have to check after every deallocation. This might be pretty | |
// expensive. Let's hope that this case is hit rarely. | |
if (ctx->cfg->debugging) { | |
fprintf(log, "Actually allocating the desired block.\n"); | |
} | |
int error = opencl_alloc_actual(ctx, min_size, mem_out); | |
while (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) { | |
if (ctx->cfg->debugging) { | |
fprintf(log, "Out of OpenCL memory: releasing entry from the free list...\n"); | |
} | |
cl_mem* memptr; | |
if (free_list_first(&ctx->cl_free_list, (fl_mem*)&memptr) == 0) { | |
cl_mem mem = *memptr; | |
free(memptr); | |
error = clReleaseMemObject(mem); | |
if (error != CL_SUCCESS) { | |
return error; | |
} | |
} else { | |
break; | |
} | |
error = opencl_alloc_actual(ctx, min_size, mem_out); | |
} | |
return error; | |
} | |
static int opencl_free(struct futhark_context *ctx, | |
cl_mem mem, size_t size, const char *tag) { | |
cl_mem* memptr = malloc(sizeof(cl_mem)); | |
*memptr = mem; | |
free_list_insert(&ctx->cl_free_list, size, (fl_mem)memptr, tag); | |
return CL_SUCCESS; | |
} | |
static int opencl_free_all(struct futhark_context *ctx) { | |
free_list_pack(&ctx->cl_free_list); | |
cl_mem* memptr; | |
while (free_list_first(&ctx->cl_free_list, (fl_mem*)&memptr) == 0) { | |
cl_mem mem = *memptr; | |
free(memptr); | |
int error = clReleaseMemObject(mem); | |
if (error != CL_SUCCESS) { | |
return error; | |
} | |
} | |
return CL_SUCCESS; | |
} | |
int futhark_context_sync(struct futhark_context* ctx) { | |
// Check for any delayed error. | |
cl_int failure_idx = -1; | |
if (ctx->failure_is_an_option) { | |
OPENCL_SUCCEED_OR_RETURN( | |
clEnqueueReadBuffer(ctx->queue, | |
ctx->global_failure, | |
CL_FALSE, | |
0, sizeof(cl_int), &failure_idx, | |
0, NULL, NULL)); | |
ctx->failure_is_an_option = 0; | |
} | |
OPENCL_SUCCEED_OR_RETURN(clFinish(ctx->queue)); | |
if (failure_idx >= 0) { | |
// We have to clear global_failure so that the next entry point | |
// is not considered a failure from the start. | |
cl_int no_failure = -1; | |
OPENCL_SUCCEED_OR_RETURN( | |
clEnqueueWriteBuffer(ctx->queue, ctx->global_failure, CL_TRUE, | |
0, sizeof(cl_int), &no_failure, | |
0, NULL, NULL)); | |
int64_t args[max_failure_args+1]; | |
OPENCL_SUCCEED_OR_RETURN( | |
clEnqueueReadBuffer(ctx->queue, | |
ctx->global_failure_args, | |
CL_TRUE, | |
0, sizeof(args), &args, | |
0, NULL, NULL)); | |
ctx->error = get_failure_msg(failure_idx, args); | |
return FUTHARK_PROGRAM_ERROR; | |
} | |
return 0; | |
} | |
// We take as input several strings representing the program, because | |
// C does not guarantee that the compiler supports particularly large | |
// literals. Notably, Visual C has a limit of 2048 characters. The | |
// array must be NULL-terminated. | |
static void setup_opencl_with_command_queue(struct futhark_context *ctx, | |
cl_command_queue queue, | |
const char *srcs[], | |
const char *extra_build_opts[], | |
const char* cache_fname) { | |
int error; | |
free_list_init(&ctx->cl_free_list); | |
ctx->queue = queue; | |
OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx->ctx, NULL)); | |
// Fill out the device info. This is redundant work if we are | |
// called from setup_opencl() (which is the common case), but I | |
// doubt it matters much. | |
struct opencl_device_option device_option; | |
OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_DEVICE, | |
sizeof(cl_device_id), | |
&device_option.device, | |
NULL)); | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PLATFORM, | |
sizeof(cl_platform_id), | |
&device_option.platform, | |
NULL)); | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_TYPE, | |
sizeof(cl_device_type), | |
&device_option.device_type, | |
NULL)); | |
device_option.platform_name = opencl_platform_info(device_option.platform, CL_PLATFORM_NAME); | |
device_option.device_name = opencl_device_info(device_option.device, CL_DEVICE_NAME); | |
ctx->device = device_option.device; | |
if (f64_required) { | |
cl_uint supported; | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, | |
sizeof(cl_uint), &supported, NULL)); | |
if (!supported) { | |
futhark_panic(1, "Program uses double-precision floats, but this is not supported on the chosen device: %s\n", | |
device_option.device_name); | |
} | |
} | |
size_t max_group_size; | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, | |
sizeof(size_t), &max_group_size, NULL)); | |
size_t max_tile_size = sqrt(max_group_size); | |
cl_ulong max_local_memory; | |
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_LOCAL_MEM_SIZE, | |
sizeof(size_t), &max_local_memory, NULL)); | |
// Futhark reserves 4 bytes for bookkeeping information. | |
max_local_memory -= 4; | |
// The OpenCL implementation may reserve some local memory bytes for | |
// various purposes. In principle, we should use | |
// clGetKernelWorkGroupInfo() to figure out for each kernel how much | |
// is actually available, but our current code generator design | |
// makes this infeasible. Instead, we have this nasty hack where we | |
// arbitrarily subtract some bytes, based on empirical measurements | |
// (but which might be arbitrarily wrong). Fortunately, we rarely | |
// try to really push the local memory usage. | |
if (strstr(device_option.platform_name, "NVIDIA CUDA") != NULL) { | |
max_local_memory -= 12; | |
} else if (strstr(device_option.platform_name, "AMD") != NULL) { | |
max_local_memory -= 16; | |
} | |
// Make sure this function is defined. | |
post_opencl_setup(ctx, &device_option); | |
if (max_group_size < ctx->cfg->default_group_size) { | |
if (ctx->cfg->default_group_size_changed) { | |
fprintf(stderr, "Note: Device limits default group size to %zu (down from %zu).\n", | |
max_group_size, ctx->cfg->default_group_size); | |
} | |
ctx->cfg->default_group_size = max_group_size; | |
} | |
if (max_tile_size < ctx->cfg->default_tile_size) { | |
if (ctx->cfg->default_tile_size_changed) { | |
fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n", | |
max_tile_size, ctx->cfg->default_tile_size); | |
} | |
ctx->cfg->default_tile_size = max_tile_size; | |
} | |
ctx->max_group_size = max_group_size; | |
ctx->max_tile_size = max_tile_size; // No limit. | |
ctx->max_threshold = ctx->max_num_groups = 0; // No limit. | |
ctx->max_local_memory = max_local_memory; | |
// Now we go through all the sizes, clamp them to the valid range, | |
// or set them to the default. | |
for (int i = 0; i < ctx->cfg->num_tuning_params; i++) { | |
const char *size_class = ctx->cfg->tuning_param_classes[i]; | |
int64_t *size_value = &ctx->cfg->tuning_params[i]; | |
const char* size_name = ctx->cfg->tuning_param_names[i]; | |
int64_t max_value = 0, default_value = 0; | |
if (strstr(size_class, "group_size") == size_class) { | |
max_value = max_group_size; | |
default_value = ctx->cfg->default_group_size; | |
} else if (strstr(size_class, "num_groups") == size_class) { | |
max_value = max_group_size; // Futhark assumes this constraint. | |
default_value = ctx->cfg->default_num_groups; | |
// XXX: as a quick and dirty hack, use twice as many threads for | |
// histograms by default. We really should just be smarter | |
// about sizes somehow. | |
if (strstr(size_name, ".seghist_") != NULL) { | |
default_value *= 2; | |
} | |
} else if (strstr(size_class, "tile_size") == size_class) { | |
max_value = sqrt(max_group_size); | |
default_value = ctx->cfg->default_tile_size; | |
} else if (strstr(size_class, "reg_tile_size") == size_class) { | |
max_value = 0; // No limit. | |
default_value = ctx->cfg->default_reg_tile_size; | |
} else if (strstr(size_class, "threshold") == size_class) { | |
// Threshold can be as large as it takes. | |
default_value = ctx->cfg->default_threshold; | |
} else { | |
// Bespoke sizes have no limit or default. | |
} | |
if (*size_value == 0) { | |
*size_value = default_value; | |
} else if (max_value > 0 && *size_value > max_value) { | |
fprintf(stderr, "Note: Device limits %s to %d (down from %d)\n", | |
size_name, (int)max_value, (int)*size_value); | |
*size_value = max_value; | |
} | |
} | |
if (ctx->lockstep_width == 0) { | |
ctx->lockstep_width = 1; | |
} | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Lockstep width: %d\n", (int)ctx->lockstep_width); | |
fprintf(stderr, "Default group size: %d\n", (int)ctx->cfg->default_group_size); | |
fprintf(stderr, "Default number of groups: %d\n", (int)ctx->cfg->default_num_groups); | |
} | |
char *compile_opts = mk_compile_opts(ctx, extra_build_opts, device_option); | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "OpenCL compiler options: %s\n", compile_opts); | |
} | |
char *fut_opencl_src = NULL; | |
cl_program prog; | |
error = CL_SUCCESS; | |
struct cache_hash h; | |
int loaded_from_cache = 0; | |
if (ctx->cfg->load_binary_from == NULL) { | |
size_t src_size = 0; | |
// Maybe we have to read OpenCL source from somewhere else (used for debugging). | |
if (ctx->cfg->load_program_from != NULL) { | |
fut_opencl_src = slurp_file(ctx->cfg->load_program_from, NULL); | |
assert(fut_opencl_src != NULL); | |
} else { | |
// Construct the OpenCL source concatenating all the fragments. | |
for (const char **src = srcs; src && *src; src++) { | |
src_size += strlen(*src); | |
} | |
fut_opencl_src = (char*) malloc(src_size + 1); | |
size_t n, i; | |
for (i = 0, n = 0; srcs && srcs[i]; i++) { | |
strncpy(fut_opencl_src+n, srcs[i], src_size-n); | |
n += strlen(srcs[i]); | |
} | |
fut_opencl_src[src_size] = 0; | |
} | |
if (ctx->cfg->dump_program_to != NULL) { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Dumping OpenCL source to %s...\n", ctx->cfg->dump_program_to); | |
} | |
dump_file(ctx->cfg->dump_program_to, fut_opencl_src, strlen(fut_opencl_src)); | |
} | |
if (cache_fname != NULL) { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Restoring cache from from %s...\n", cache_fname); | |
} | |
cache_hash_init(&h); | |
cache_hash(&h, fut_opencl_src, strlen(fut_opencl_src)); | |
cache_hash(&h, compile_opts, strlen(compile_opts)); | |
unsigned char *buf; | |
size_t bufsize; | |
errno = 0; | |
if (cache_restore(cache_fname, &h, &buf, &bufsize) != 0) { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Failed to restore cache (errno: %s)\n", strerror(errno)); | |
} | |
} else { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Cache restored; loading OpenCL binary...\n"); | |
} | |
cl_int status = 0; | |
prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device, | |
&bufsize, (const unsigned char**)&buf, | |
&status, &error); | |
if (status == CL_SUCCESS) { | |
loaded_from_cache = 1; | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Loading succeeded.\n"); | |
} | |
} else { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Loading failed.\n"); | |
} | |
} | |
} | |
} | |
if (!loaded_from_cache) { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Creating OpenCL program...\n"); | |
} | |
const char* src_ptr[] = {fut_opencl_src}; | |
prog = clCreateProgramWithSource(ctx->ctx, 1, src_ptr, &src_size, &error); | |
OPENCL_SUCCEED_FATAL(error); | |
} | |
} else { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Loading OpenCL binary from %s...\n", ctx->cfg->load_binary_from); | |
} | |
size_t binary_size; | |
unsigned char *fut_opencl_bin = | |
(unsigned char*) slurp_file(ctx->cfg->load_binary_from, &binary_size); | |
assert(fut_opencl_bin != NULL); | |
const unsigned char *binaries[1] = { fut_opencl_bin }; | |
cl_int status = 0; | |
prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device, | |
&binary_size, binaries, | |
&status, &error); | |
OPENCL_SUCCEED_FATAL(status); | |
OPENCL_SUCCEED_FATAL(error); | |
} | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Building OpenCL program...\n"); | |
} | |
OPENCL_SUCCEED_FATAL(build_opencl_program(prog, device_option.device, compile_opts)); | |
free(compile_opts); | |
free(fut_opencl_src); | |
size_t binary_size = 0; | |
unsigned char *binary = NULL; | |
int store_in_cache = cache_fname != NULL && !loaded_from_cache; | |
if (store_in_cache || ctx->cfg->dump_binary_to != NULL) { | |
OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, | |
sizeof(size_t), &binary_size, NULL)); | |
binary = (unsigned char*) malloc(binary_size); | |
OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARIES, | |
sizeof(unsigned char*), &binary, NULL)); | |
} | |
if (store_in_cache) { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Caching OpenCL binary in %s...\n", cache_fname); | |
} | |
if (cache_store(cache_fname, &h, binary, binary_size) != 0) { | |
printf("Failed to cache binary: %s\n", strerror(errno)); | |
} | |
} | |
if (ctx->cfg->dump_binary_to != NULL) { | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Dumping OpenCL binary to %s...\n", ctx->cfg->dump_binary_to); | |
} | |
dump_file(ctx->cfg->dump_binary_to, binary, binary_size); | |
} | |
ctx->clprogram = prog; | |
} | |
static struct opencl_device_option get_preferred_device(const struct futhark_context_config *cfg) { | |
struct opencl_device_option *devices; | |
size_t num_devices; | |
opencl_all_device_options(&devices, &num_devices); | |
int num_device_matches = 0; | |
for (size_t i = 0; i < num_devices; i++) { | |
struct opencl_device_option device = devices[i]; | |
if (strstr(device.platform_name, cfg->preferred_platform) != NULL && | |
strstr(device.device_name, cfg->preferred_device) != NULL && | |
(cfg->ignore_blacklist || | |
!is_blacklisted(device.platform_name, device.device_name, cfg)) && | |
num_device_matches++ == cfg->preferred_device_num) { | |
// Free all the platform and device names, except the ones we have chosen. | |
for (size_t j = 0; j < num_devices; j++) { | |
if (j != i) { | |
free(devices[j].platform_name); | |
free(devices[j].device_name); | |
} | |
} | |
free(devices); | |
return device; | |
} | |
} | |
futhark_panic(1, "Could not find acceptable OpenCL device.\n"); | |
exit(1); // Never reached | |
} | |
static void setup_opencl(struct futhark_context *ctx, | |
const char *srcs[], | |
const char *extra_build_opts[], | |
const char* cache_fname) { | |
struct opencl_device_option device_option = get_preferred_device(ctx->cfg); | |
if (ctx->cfg->logging) { | |
fprintf(stderr, "Using platform: %s\n", device_option.platform_name); | |
fprintf(stderr, "Using device: %s\n", device_option.device_name); | |
} | |
// Note that NVIDIA's OpenCL requires the platform property | |
cl_context_properties properties[] = { | |
CL_CONTEXT_PLATFORM, | |
(cl_context_properties)device_option.platform, | |
0 | |
}; | |
cl_int clCreateContext_error; | |
ctx->ctx = clCreateContext(properties, 1, &device_option.device, NULL, NULL, &clCreateContext_error); | |
OPENCL_SUCCEED_FATAL(clCreateContext_error); | |
cl_int clCreateCommandQueue_error; | |
cl_command_queue queue = | |
clCreateCommandQueue(ctx->ctx, | |
device_option.device, | |
ctx->cfg->profiling ? CL_QUEUE_PROFILING_ENABLE : 0, | |
&clCreateCommandQueue_error); | |
OPENCL_SUCCEED_FATAL(clCreateCommandQueue_error); | |
setup_opencl_with_command_queue(ctx, queue, srcs, extra_build_opts, cache_fname); | |
} | |
int backend_context_setup(struct futhark_context* ctx) { | |
ctx->lockstep_width = 0; // Real value set later. | |
ctx->profiling_records_capacity = 200; | |
ctx->profiling_records_used = 0; | |
ctx->profiling_records = | |
malloc(ctx->profiling_records_capacity * | |
sizeof(struct profiling_record)); | |
ctx->failure_is_an_option = 0; | |
ctx->total_runs = 0; | |
ctx->total_runtime = 0; | |
ctx->peak_mem_usage_device = 0; | |
ctx->cur_mem_usage_device = 0; | |
if (ctx->cfg->queue_set) { | |
setup_opencl_with_command_queue(ctx, ctx->cfg->queue, opencl_program, ctx->cfg->build_opts, ctx->cfg->cache_fname); | |
} else { | |
setup_opencl(ctx, opencl_program, ctx->cfg->build_opts, ctx->cfg->cache_fname); | |
} | |
cl_int error; | |
cl_int no_error = -1; | |
ctx->global_failure = | |
clCreateBuffer(ctx->ctx, | |
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, | |
sizeof(cl_int), &no_error, &error); | |
OPENCL_SUCCEED_OR_RETURN(error); | |
// The +1 is to avoid zero-byte allocations. | |
ctx->global_failure_args = | |
clCreateBuffer(ctx->ctx, | |
CL_MEM_READ_WRITE, | |
sizeof(int64_t)*(max_failure_args+1), NULL, &error); | |
OPENCL_SUCCEED_OR_RETURN(error); | |
return 0; | |
} | |
void backend_context_teardown(struct futhark_context* ctx) { | |
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure)); | |
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args)); | |
(void)opencl_tally_profiling_records(ctx); | |
free(ctx->profiling_records); | |
(void)opencl_free_all(ctx); | |
(void)clReleaseProgram(ctx->clprogram); | |
(void)clReleaseCommandQueue(ctx->queue); | |
(void)clReleaseContext(ctx->ctx); | |
} | |
cl_command_queue futhark_context_get_command_queue(struct futhark_context* ctx) { | |
return ctx->queue; | |
} | |
// End of backends/opencl.h | |
// Produce a freshly allocated message for the failure with index
// 'failure_idx'.  This program has no failure cases of its own, so
// every index maps to the same generic message.  The caller owns the
// returned string.
static char *get_failure_msg(int failure_idx, int64_t args[])
{
  (void) failure_idx;
  (void) args;
  char *msg = strdup("Unknown error. This is a compiler bug.");
  return msg;
}
void post_opencl_setup(struct futhark_context *ctx, struct opencl_device_option *option) | |
{ | |
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "NVIDIA CUDA") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->lockstep_width = 32; | |
} | |
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "AMD Accelerated Parallel Processing") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->lockstep_width = 32; | |
} | |
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->lockstep_width = 1; | |
} | |
if ((ctx->cfg->default_num_groups == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
size_t MAX_COMPUTE_UNITS_val = 0; | |
clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL); | |
ctx->cfg->default_num_groups = 4 * MAX_COMPUTE_UNITS_val; | |
} | |
if ((ctx->cfg->default_group_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->cfg->default_group_size = 256; | |
} | |
if ((ctx->cfg->default_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->cfg->default_tile_size = 16; | |
} | |
if ((ctx->cfg->default_reg_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->cfg->default_reg_tile_size = 4; | |
} | |
if ((ctx->cfg->default_threshold == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) { | |
ctx->cfg->default_threshold = 32768; | |
} | |
if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { | |
ctx->lockstep_width = 1; | |
} | |
if ((ctx->cfg->default_num_groups == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { | |
size_t MAX_COMPUTE_UNITS_val = 0; | |
clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL); | |
ctx->cfg->default_num_groups = MAX_COMPUTE_UNITS_val; | |
} | |
if ((ctx->cfg->default_group_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { | |
ctx->cfg->default_group_size = 32; | |
} | |
if ((ctx->cfg->default_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { | |
ctx->cfg->default_tile_size = 4; | |
} | |
if ((ctx->cfg->default_reg_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { | |
ctx->cfg->default_reg_tile_size = 1; | |
} | |
if ((ctx->cfg->default_threshold == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) { | |
size_t MAX_COMPUTE_UNITS_val = 0; | |
clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL); | |
ctx->cfg->default_threshold = MAX_COMPUTE_UNITS_val; | |
} | |
} | |
// Per-program OpenCL state stored in ctx->program: the kernel handles
// and the profiling counters that futhark_context_report() prints.
struct program { | |
// Kernel objects.  setup_program() creates them with clCreateKernel()
// and teardown_program() releases them.
cl_kernel addzisegmap_6892; | |
cl_kernel add_i64zisegmap_6912; | |
// Profiling counters: one (total runtime, number of runs) pair per
// kind of memory copy.  setup_program() resets them to zero, and
// futhark_context_report() prints the runtimes with a "us" suffix.
int64_t copy_dev_to_dev_total_runtime; | |
int copy_dev_to_dev_runs; | |
int64_t copy_dev_to_host_total_runtime; | |
int copy_dev_to_host_runs; | |
int64_t copy_host_to_dev_total_runtime; | |
int copy_host_to_dev_runs; | |
int64_t copy_scalar_to_dev_total_runtime; | |
int copy_scalar_to_dev_runs; | |
int64_t copy_scalar_from_dev_total_runtime; | |
int copy_scalar_from_dev_runs; | |
// The same pair of counters for each of the two kernels above.
int64_t addzisegmap_6892_total_runtime; | |
int addzisegmap_6892_runs; | |
int64_t add_i64zisegmap_6912_total_runtime; | |
int add_i64zisegmap_6912_runs; | |
}; | |
static void setup_program(struct futhark_context *ctx) | |
{ | |
(void) ctx; | |
int error = 0; | |
(void) error; | |
ctx->program = malloc(sizeof(struct program)); | |
ctx->program->copy_dev_to_dev_total_runtime = 0; | |
ctx->program->copy_dev_to_dev_runs = 0; | |
ctx->program->copy_dev_to_host_total_runtime = 0; | |
ctx->program->copy_dev_to_host_runs = 0; | |
ctx->program->copy_host_to_dev_total_runtime = 0; | |
ctx->program->copy_host_to_dev_runs = 0; | |
ctx->program->copy_scalar_to_dev_total_runtime = 0; | |
ctx->program->copy_scalar_to_dev_runs = 0; | |
ctx->program->copy_scalar_from_dev_total_runtime = 0; | |
ctx->program->copy_scalar_from_dev_runs = 0; | |
ctx->program->addzisegmap_6892_total_runtime = 0; | |
ctx->program->addzisegmap_6892_runs = 0; | |
ctx->program->add_i64zisegmap_6912_total_runtime = 0; | |
ctx->program->add_i64zisegmap_6912_runs = 0; | |
{ | |
ctx->program->addzisegmap_6892 = clCreateKernel(ctx->clprogram, "addzisegmap_6892", &error); | |
OPENCL_SUCCEED_FATAL(error); | |
OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->program->addzisegmap_6892, 0, sizeof(cl_mem), &ctx->global_failure)); | |
if (ctx->debugging) | |
fprintf(ctx->log, "Created kernel %s.\n", "add.segmap_6892"); | |
} | |
{ | |
ctx->program->add_i64zisegmap_6912 = clCreateKernel(ctx->clprogram, "add_i64zisegmap_6912", &error); | |
OPENCL_SUCCEED_FATAL(error); | |
OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 0, sizeof(cl_mem), &ctx->global_failure)); | |
if (ctx->debugging) | |
fprintf(ctx->log, "Created kernel %s.\n", "add_i64.segmap_6912"); | |
} | |
} | |
static void teardown_program(struct futhark_context *ctx) | |
{ | |
(void) ctx; | |
int error = 0; | |
(void) error; | |
OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->program->addzisegmap_6892)); | |
OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->program->add_i64zisegmap_6912)); | |
free(ctx->program); | |
} | |
static void set_tuning_params(struct futhark_context *ctx) | |
{ | |
(void) ctx; | |
ctx->tuning_params.addzisegmap_group_sizze_6879 = &ctx->cfg->tuning_params[0]; | |
ctx->tuning_params.add_i64zisegmap_group_sizze_6899 = &ctx->cfg->tuning_params[1]; | |
} | |
int memblock_unref_device(struct futhark_context *ctx, struct memblock_device *block, const char *desc) | |
{ | |
if (block->references != NULL) { | |
*block->references -= 1; | |
if (ctx->detail_memory) | |
fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "space 'device'", *block->references); | |
if (*block->references == 0) { | |
ctx->cur_mem_usage_device -= block->size; | |
OPENCL_SUCCEED_OR_RETURN(opencl_free(ctx, block->mem, block->size, desc)); | |
free(block->references); | |
if (ctx->detail_memory) | |
fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_device); | |
} | |
block->references = NULL; | |
} | |
return 0; | |
} | |
int memblock_alloc_device(struct futhark_context *ctx, struct memblock_device *block, int64_t size, const char *desc) | |
{ | |
if (size < 0) | |
futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "space 'device'", ctx->cur_mem_usage_device); | |
int ret = memblock_unref_device(ctx, block, desc); | |
if (ret != FUTHARK_SUCCESS) | |
return ret; | |
if (ctx->detail_memory) | |
fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "space 'device'", (long long) ctx->cur_mem_usage_device + size); | |
if (ctx->cur_mem_usage_device > ctx->peak_mem_usage_device) { | |
ctx->peak_mem_usage_device = ctx->cur_mem_usage_device; | |
if (ctx->detail_memory) | |
fprintf(ctx->log, " (new peak).\n"); | |
} else if (ctx->detail_memory) | |
fprintf(ctx->log, ".\n"); | |
ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_alloc(ctx, ctx->log, (size_t) size, desc, &block->mem, (size_t *) &size)); | |
if (ctx->error == NULL) { | |
block->references = (int *) malloc(sizeof(int)); | |
*block->references = 1; | |
block->size = size; | |
block->desc = desc; | |
ctx->cur_mem_usage_device += size; | |
return FUTHARK_SUCCESS; | |
} else { | |
// We are naively assuming that any memory allocation error is due to OOM. | |
lock_lock(&ctx->error_lock); | |
char *old_error = ctx->error; | |
ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated: %12lld bytes\n%s", "space 'device'", (long long) size, (long long) ctx->cur_mem_usage_device, old_error); | |
free(old_error); | |
lock_unlock(&ctx->error_lock); | |
return FUTHARK_OUT_OF_MEMORY; | |
} | |
} | |
int memblock_set_device(struct futhark_context *ctx, struct memblock_device *lhs, struct memblock_device *rhs, const char *lhs_desc) | |
{ | |
int ret = memblock_unref_device(ctx, lhs, lhs_desc); | |
if (rhs->references != NULL) | |
(*rhs->references)++; | |
*lhs = *rhs; | |
return ret; | |
} | |
int memblock_unref(struct futhark_context *ctx, struct memblock *block, const char *desc) | |
{ | |
if (block->references != NULL) { | |
*block->references -= 1; | |
if (ctx->detail_memory) | |
fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "default space", *block->references); | |
if (*block->references == 0) { | |
ctx->cur_mem_usage_default -= block->size; | |
host_free(ctx, (size_t) block->size, desc, (void *) block->mem); | |
free(block->references); | |
if (ctx->detail_memory) | |
fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_default); | |
} | |
block->references = NULL; | |
} | |
return 0; | |
} | |
int memblock_alloc(struct futhark_context *ctx, struct memblock *block, int64_t size, const char *desc) | |
{ | |
if (size < 0) | |
futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "default space", ctx->cur_mem_usage_default); | |
int ret = memblock_unref(ctx, block, desc); | |
if (ret != FUTHARK_SUCCESS) | |
return ret; | |
if (ctx->detail_memory) | |
fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "default space", (long long) ctx->cur_mem_usage_default + size); | |
if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) { | |
ctx->peak_mem_usage_default = ctx->cur_mem_usage_default; | |
if (ctx->detail_memory) | |
fprintf(ctx->log, " (new peak).\n"); | |
} else if (ctx->detail_memory) | |
fprintf(ctx->log, ".\n"); | |
host_alloc(ctx, (size_t) size, desc, (size_t *) &size, (void *) &block->mem); | |
if (ctx->error == NULL) { | |
block->references = (int *) malloc(sizeof(int)); | |
*block->references = 1; | |
block->size = size; | |
block->desc = desc; | |
ctx->cur_mem_usage_default += size; | |
return FUTHARK_SUCCESS; | |
} else { | |
// We are naively assuming that any memory allocation error is due to OOM. | |
lock_lock(&ctx->error_lock); | |
char *old_error = ctx->error; | |
ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated: %12lld bytes\n%s", "default space", (long long) size, (long long) ctx->cur_mem_usage_default, old_error); | |
free(old_error); | |
lock_unlock(&ctx->error_lock); | |
return FUTHARK_OUT_OF_MEMORY; | |
} | |
} | |
int memblock_set(struct futhark_context *ctx, struct memblock *lhs, struct memblock *rhs, const char *lhs_desc) | |
{ | |
int ret = memblock_unref(ctx, lhs, lhs_desc); | |
if (rhs->references != NULL) | |
(*rhs->references)++; | |
*lhs = *rhs; | |
return ret; | |
} | |
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag) | |
{ | |
cfg->profiling = cfg->logging = cfg->debugging = flag; | |
} | |
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag) | |
{ | |
cfg->profiling = flag; | |
} | |
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag) | |
{ | |
cfg->logging = flag; | |
} | |
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f) | |
{ | |
cfg->cache_fname = f; | |
} | |
int futhark_get_tuning_param_count(void) | |
{ | |
return num_tuning_params; | |
} | |
const char *futhark_get_tuning_param_name(int i) | |
{ | |
return tuning_param_names[i]; | |
} | |
const char *futhark_get_tuning_param_class(int i) | |
{ | |
return tuning_param_classes[i]; | |
} | |
char *futhark_context_report(struct futhark_context *ctx) | |
{ | |
if (futhark_context_sync(ctx) != 0) | |
return NULL; | |
struct str_builder builder; | |
str_builder_init(&builder); | |
str_builder(&builder, "Peak memory usage for space 'device': %lld bytes.\n", (long long) ctx->peak_mem_usage_device); | |
{ } | |
if (ctx->profiling) { | |
OPENCL_SUCCEED_FATAL(opencl_tally_profiling_records(ctx)); | |
str_builder(&builder, "copy_dev_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_dev_to_dev_runs, (long) ctx->program->copy_dev_to_dev_total_runtime / (ctx->program->copy_dev_to_dev_runs != 0 ? ctx->program->copy_dev_to_dev_runs : 1), (long) ctx->program->copy_dev_to_dev_total_runtime); | |
ctx->total_runtime += ctx->program->copy_dev_to_dev_total_runtime; | |
ctx->total_runs += ctx->program->copy_dev_to_dev_runs; | |
str_builder(&builder, "copy_dev_to_host ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_dev_to_host_runs, (long) ctx->program->copy_dev_to_host_total_runtime / (ctx->program->copy_dev_to_host_runs != 0 ? ctx->program->copy_dev_to_host_runs : 1), (long) ctx->program->copy_dev_to_host_total_runtime); | |
ctx->total_runtime += ctx->program->copy_dev_to_host_total_runtime; | |
ctx->total_runs += ctx->program->copy_dev_to_host_runs; | |
str_builder(&builder, "copy_host_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_host_to_dev_runs, (long) ctx->program->copy_host_to_dev_total_runtime / (ctx->program->copy_host_to_dev_runs != 0 ? ctx->program->copy_host_to_dev_runs : 1), (long) ctx->program->copy_host_to_dev_total_runtime); | |
ctx->total_runtime += ctx->program->copy_host_to_dev_total_runtime; | |
ctx->total_runs += ctx->program->copy_host_to_dev_runs; | |
str_builder(&builder, "copy_scalar_to_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_scalar_to_dev_runs, (long) ctx->program->copy_scalar_to_dev_total_runtime / (ctx->program->copy_scalar_to_dev_runs != 0 ? ctx->program->copy_scalar_to_dev_runs : 1), (long) ctx->program->copy_scalar_to_dev_total_runtime); | |
ctx->total_runtime += ctx->program->copy_scalar_to_dev_total_runtime; | |
ctx->total_runs += ctx->program->copy_scalar_to_dev_runs; | |
str_builder(&builder, "copy_scalar_from_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_scalar_from_dev_runs, (long) ctx->program->copy_scalar_from_dev_total_runtime / (ctx->program->copy_scalar_from_dev_runs != 0 ? ctx->program->copy_scalar_from_dev_runs : 1), (long) ctx->program->copy_scalar_from_dev_total_runtime); | |
ctx->total_runtime += ctx->program->copy_scalar_from_dev_total_runtime; | |
ctx->total_runs += ctx->program->copy_scalar_from_dev_runs; | |
str_builder(&builder, "add.segmap_6892 ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->addzisegmap_6892_runs, (long) ctx->program->addzisegmap_6892_total_runtime / (ctx->program->addzisegmap_6892_runs != 0 ? ctx->program->addzisegmap_6892_runs : 1), (long) ctx->program->addzisegmap_6892_total_runtime); | |
ctx->total_runtime += ctx->program->addzisegmap_6892_total_runtime; | |
ctx->total_runs += ctx->program->addzisegmap_6892_runs; | |
str_builder(&builder, "add_i64.segmap_6912 ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->add_i64zisegmap_6912_runs, (long) ctx->program->add_i64zisegmap_6912_total_runtime / (ctx->program->add_i64zisegmap_6912_runs != 0 ? ctx->program->add_i64zisegmap_6912_runs : 1), (long) ctx->program->add_i64zisegmap_6912_total_runtime); | |
ctx->total_runtime += ctx->program->add_i64zisegmap_6912_total_runtime; | |
ctx->total_runs += ctx->program->add_i64zisegmap_6912_runs; | |
str_builder(&builder, "%d operations with cumulative runtime: %6ldus\n", ctx->total_runs, ctx->total_runtime); | |
} | |
return builder.str; | |
} | |
char *futhark_context_get_error(struct futhark_context *ctx) | |
{ | |
char *error = ctx->error; | |
ctx->error = NULL; | |
return error; | |
} | |
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f) | |
{ | |
ctx->log = f; | |
} | |
void futhark_context_pause_profiling(struct futhark_context *ctx) | |
{ | |
ctx->profiling_paused = 1; | |
} | |
void futhark_context_unpause_profiling(struct futhark_context *ctx) | |
{ | |
ctx->profiling_paused = 0; | |
} | |
int futhark_context_clear_caches(struct futhark_context *ctx) | |
{ | |
lock_lock(&ctx->lock); | |
ctx->peak_mem_usage_device = 0; | |
ctx->peak_mem_usage_default = 0; | |
if (ctx->error == NULL) | |
ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_free_all(ctx)); | |
lock_unlock(&ctx->lock); | |
return ctx->error != NULL; | |
} | |
// Start of context.h | |
// Eventually it would be nice to move the context definition in here | |
// instead of generating it in the compiler. For now it defines | |
// various helper functions that must be available. | |
// Internal functions. | |
static void set_error(struct futhark_context* ctx, char *error) { | |
lock_lock(&ctx->error_lock); | |
if (ctx->error == NULL) { | |
ctx->error = error; | |
} else { | |
free(error); | |
} | |
lock_unlock(&ctx->error_lock); | |
} | |
// XXX: should be static, but used in ispc_util.h
//
// Report a failed allocation of 'new_size' bytes through set_error().
void lexical_realloc_error(struct futhark_context* ctx, size_t new_size) {
  char *message =
    msgprintf("Failed to allocate memory.\nAttempted allocation: %12lld bytes\n",
              (long long) new_size);
  set_error(ctx, message);
}
static int lexical_realloc(struct futhark_context *ctx, | |
unsigned char **ptr, | |
int64_t *old_size, | |
int64_t new_size) { | |
unsigned char *new = realloc(*ptr, (size_t)new_size); | |
if (new == NULL) { | |
lexical_realloc_error(ctx, new_size); | |
return FUTHARK_OUT_OF_MEMORY; | |
} else { | |
*ptr = new; | |
*old_size = new_size; | |
return FUTHARK_SUCCESS; | |
} | |
} | |
static void free_all_in_free_list(struct futhark_context* ctx) { | |
fl_mem mem; | |
free_list_pack(&ctx->free_list); | |
while (free_list_first(&ctx->free_list, (fl_mem*)&mem) == 0) { | |
free((void*)mem); | |
} | |
} | |
// Return 1 if an allocation of 'size' bytes counts as small (strictly
// less than 1 MiB) and 0 otherwise.
static int is_small_alloc(size_t size) {
  const size_t small_alloc_limit = 1024*1024;
  if (size < small_alloc_limit) {
    return 1;
  }
  return 0;
}
static void host_alloc(struct futhark_context* ctx, | |
size_t size, const char* tag, size_t* size_out, void** mem_out) { | |
if (is_small_alloc(size) || free_list_find(&ctx->free_list, size, tag, size_out, (fl_mem*)mem_out) != 0) { | |
*size_out = size; | |
*mem_out = malloc(size); | |
} | |
} | |
static void host_free(struct futhark_context* ctx, | |
size_t size, const char* tag, void* mem) { | |
// Small allocations are handled by malloc()s own free list. The | |
// threshold here is kind of arbitrary, but seems to work OK. | |
// Larger allocations are mmap()ed/munmapped() every time, which is | |
// very slow, and Futhark programs tend to use a few very large | |
// allocations. | |
if (is_small_alloc(size)) { | |
free(mem); | |
} else { | |
free_list_insert(&ctx->free_list, size, (fl_mem)mem, tag); | |
} | |
} | |
struct futhark_context_config* futhark_context_config_new(void) { | |
struct futhark_context_config* cfg = malloc(sizeof(struct futhark_context_config)); | |
if (cfg == NULL) { | |
return NULL; | |
} | |
cfg->in_use = 0; | |
cfg->debugging = 0; | |
cfg->profiling = 0; | |
cfg->logging = 0; | |
cfg->cache_fname = NULL; | |
cfg->num_tuning_params = num_tuning_params; | |
cfg->tuning_params = malloc(cfg->num_tuning_params * sizeof(int64_t)); | |
memcpy(cfg->tuning_params, tuning_param_defaults, | |
cfg->num_tuning_params * sizeof(int64_t)); | |
cfg->tuning_param_names = tuning_param_names; | |
cfg->tuning_param_vars = tuning_param_vars; | |
cfg->tuning_param_classes = tuning_param_classes; | |
backend_context_config_setup(cfg); | |
return cfg; | |
} | |
void futhark_context_config_free(struct futhark_context_config* cfg) { | |
assert(!cfg->in_use); | |
backend_context_config_teardown(cfg); | |
free(cfg->tuning_params); | |
free(cfg); | |
} | |
struct futhark_context* futhark_context_new(struct futhark_context_config* cfg) { | |
struct futhark_context* ctx = malloc(sizeof(struct futhark_context)); | |
if (ctx == NULL) { | |
return NULL; | |
} | |
assert(!cfg->in_use); | |
ctx->cfg = cfg; | |
ctx->cfg->in_use = 1; | |
create_lock(&ctx->error_lock); | |
create_lock(&ctx->lock); | |
free_list_init(&ctx->free_list); | |
ctx->peak_mem_usage_default = 0; | |
ctx->cur_mem_usage_default = 0; | |
ctx->constants = malloc(sizeof(struct constants)); | |
ctx->detail_memory = cfg->debugging; | |
ctx->debugging = cfg->debugging; | |
ctx->logging = cfg->logging; | |
ctx->profiling = cfg->profiling; | |
ctx->profiling_paused = 0; | |
ctx->error = NULL; | |
ctx->log = stderr; | |
if (backend_context_setup(ctx) == 0) { | |
set_tuning_params(ctx); | |
setup_program(ctx); | |
init_constants(ctx); | |
(void)futhark_context_clear_caches(ctx); | |
(void)futhark_context_sync(ctx); | |
} | |
return ctx; | |
} | |
void futhark_context_free(struct futhark_context* ctx) { | |
free_constants(ctx); | |
teardown_program(ctx); | |
backend_context_teardown(ctx); | |
free_all_in_free_list(ctx); | |
free_list_destroy(&ctx->free_list); | |
free(ctx->constants); | |
free_lock(&ctx->lock); | |
free_lock(&ctx->error_lock); | |
ctx->cfg->in_use = 0; | |
free(ctx); | |
} | |
// End of context.h | |
// Forward declarations of the internal functions behind the 'add' and
// 'add_i64' entry points.  Each takes two device memory blocks and a
// size, and has a device memory block as its output parameter.
static int futrts_entry_add(struct futhark_context *ctx, struct memblock_device *mem_out_p_6932, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6776); | |
static int futrts_entry_add_i64(struct futhark_context *ctx, struct memblock_device *mem_out_p_6938, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6836); | |
// Initialise the program's constants.  This program has none, so
// there is nothing to do and the result is always 0.
static int init_constants(struct futhark_context *ctx)
{
  (void) ctx;
  return 0;
}
// Release the program's constants.  This program has none, so there
// is nothing to do and the result is always 0.
static int free_constants(struct futhark_context *ctx)
{
  const int status = 0;
  (void) ctx;
  return status;
}
// A one-dimensional array of u8 values stored in device memory.
struct futhark_u8_1d { | |
// Reference-counted device memory block holding the elements.
struct memblock_device mem; | |
// shape[0] is the number of elements, as passed to
// futhark_new_u8_1d() and futhark_new_raw_u8_1d().
int64_t shape[1]; | |
}; | |
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0) | |
{ | |
struct futhark_u8_1d *bad = NULL; | |
struct futhark_u8_1d *arr = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d)); | |
if (arr == NULL) | |
return bad; | |
lock_lock(&ctx->lock); | |
arr->mem.references = NULL; | |
if (memblock_alloc_device(ctx, &arr->mem, dim0 * 1, "arr->mem")) | |
return NULL; | |
arr->shape[0] = dim0; | |
if ((size_t) dim0 * 1 > 0) | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->queue, arr->mem.mem, CL_FALSE, (size_t) 0, (size_t) ((size_t) dim0 * 1), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_host_runs, &ctx->program->copy_dev_to_host_total_runtime))); | |
lock_unlock(&ctx->lock); | |
return arr; | |
} | |
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0) | |
{ | |
struct futhark_u8_1d *bad = NULL; | |
struct futhark_u8_1d *arr = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d)); | |
if (arr == NULL) | |
return bad; | |
lock_lock(&ctx->lock); | |
arr->mem.references = NULL; | |
if (memblock_alloc_device(ctx, &arr->mem, dim0 * 1, "arr->mem")) | |
return NULL; | |
arr->shape[0] = dim0; | |
if ((size_t) dim0 * 1 > 0) { | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->queue, data, arr->mem.mem, (size_t) offset, (size_t) 0, (size_t) ((size_t) dim0 * 1), 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_dev_runs, &ctx->program->copy_dev_to_dev_total_runtime))); | |
if (ctx->debugging) | |
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue)); | |
} | |
lock_unlock(&ctx->lock); | |
return arr; | |
} | |
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr) | |
{ | |
lock_lock(&ctx->lock); | |
if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0) | |
return 1; | |
lock_unlock(&ctx->lock); | |
free(arr); | |
return 0; | |
} | |
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data) | |
{ | |
lock_lock(&ctx->lock); | |
if ((size_t) arr->shape[0] * 1 > 0) { | |
cl_bool sync_call = CL_FALSE; | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->queue, arr->mem.mem, ctx->failure_is_an_option ? CL_FALSE : sync_call, (size_t) 0, (size_t) ((size_t) arr->shape[0] * 1), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_host_to_dev_runs, &ctx->program->copy_host_to_dev_total_runtime))); | |
if ((sync_call && ctx->failure_is_an_option) && futhark_context_sync(ctx) != 0) | |
return 1; | |
} | |
lock_unlock(&ctx->lock); | |
return 0; | |
} | |
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr) | |
{ | |
(void) ctx; | |
return arr->mem.mem; | |
} | |
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr) | |
{ | |
(void) ctx; | |
return arr->shape; | |
} | |
struct futhark_i64_1d { | |
struct memblock_device mem; | |
int64_t shape[1]; | |
}; | |
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0) | |
{ | |
struct futhark_i64_1d *bad = NULL; | |
struct futhark_i64_1d *arr = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d)); | |
if (arr == NULL) | |
return bad; | |
lock_lock(&ctx->lock); | |
arr->mem.references = NULL; | |
if (memblock_alloc_device(ctx, &arr->mem, dim0 * 8, "arr->mem")) | |
return NULL; | |
arr->shape[0] = dim0; | |
if ((size_t) dim0 * 8 > 0) | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->queue, arr->mem.mem, CL_FALSE, (size_t) 0, (size_t) ((size_t) dim0 * 8), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_host_runs, &ctx->program->copy_dev_to_host_total_runtime))); | |
lock_unlock(&ctx->lock); | |
return arr; | |
} | |
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0) | |
{ | |
struct futhark_i64_1d *bad = NULL; | |
struct futhark_i64_1d *arr = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d)); | |
if (arr == NULL) | |
return bad; | |
lock_lock(&ctx->lock); | |
arr->mem.references = NULL; | |
if (memblock_alloc_device(ctx, &arr->mem, dim0 * 8, "arr->mem")) | |
return NULL; | |
arr->shape[0] = dim0; | |
if ((size_t) dim0 * 8 > 0) { | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->queue, data, arr->mem.mem, (size_t) offset, (size_t) 0, (size_t) ((size_t) dim0 * 8), 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_dev_runs, &ctx->program->copy_dev_to_dev_total_runtime))); | |
if (ctx->debugging) | |
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue)); | |
} | |
lock_unlock(&ctx->lock); | |
return arr; | |
} | |
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr) | |
{ | |
lock_lock(&ctx->lock); | |
if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0) | |
return 1; | |
lock_unlock(&ctx->lock); | |
free(arr); | |
return 0; | |
} | |
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data) | |
{ | |
lock_lock(&ctx->lock); | |
if ((size_t) arr->shape[0] * 8 > 0) { | |
cl_bool sync_call = CL_FALSE; | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->queue, arr->mem.mem, ctx->failure_is_an_option ? CL_FALSE : sync_call, (size_t) 0, (size_t) ((size_t) arr->shape[0] * 8), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_host_to_dev_runs, &ctx->program->copy_host_to_dev_total_runtime))); | |
if ((sync_call && ctx->failure_is_an_option) && futhark_context_sync(ctx) != 0) | |
return 1; | |
} | |
lock_unlock(&ctx->lock); | |
return 0; | |
} | |
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr) | |
{ | |
(void) ctx; | |
return arr->mem.mem; | |
} | |
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr) | |
{ | |
(void) ctx; | |
return arr->shape; | |
} | |
static int futrts_entry_add(struct futhark_context *ctx, struct memblock_device *mem_out_p_6932, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6776) | |
{ | |
(void) ctx; | |
int err = 0; | |
struct memblock_device mem_6920; | |
mem_6920.references = NULL; | |
struct memblock_device mem_out_6922; | |
mem_out_6922.references = NULL; | |
int64_t bytes_6919 = smax64((int64_t) 0, n_6776); | |
int64_t segmap_group_sizze_6888; | |
segmap_group_sizze_6888 = *ctx->tuning_params.addzisegmap_group_sizze_6879; | |
int64_t segmap_usable_groups_6889 = sdiv_up64(n_6776, segmap_group_sizze_6888); | |
if (memblock_alloc_device(ctx, &mem_6920, bytes_6919, "mem_6920")) { | |
err = 1; | |
goto cleanup; | |
} | |
if (ctx->debugging) | |
fprintf(ctx->log, "%s\n", "\n# SegMap"); | |
int32_t virt_num_groups_6923 = sext_i64_i32(sdiv_up64(n_6776, segmap_group_sizze_6888)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 1, sizeof(n_6776), &n_6776)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 2, sizeof(xs_mem_6916.mem), &xs_mem_6916.mem)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 3, sizeof(ys_mem_6917.mem), &ys_mem_6917.mem)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 4, sizeof(mem_6920.mem), &mem_6920.mem)); | |
if (1 * ((size_t) segmap_usable_groups_6889 * (size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879) != 0) { | |
const size_t global_work_sizze_6933[1] = {(size_t) segmap_usable_groups_6889 * (size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879}; | |
const size_t local_work_sizze_6937[1] = {(size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879}; | |
int64_t time_start_6934 = 0, time_end_6935 = 0; | |
if (ctx->debugging) { | |
fprintf(ctx->log, "Launching %s with global work size [%zu] and local work size [%zu]; local memory: %d bytes.\n", "add.segmap_6892", global_work_sizze_6933[0], local_work_sizze_6937[0], (int) 0); | |
time_start_6934 = get_wall_time(); | |
} | |
cl_event *pevent = ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->addzisegmap_6892_runs, &ctx->program->addzisegmap_6892_total_runtime); | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->queue, ctx->program->addzisegmap_6892, 1, NULL, global_work_sizze_6933, local_work_sizze_6937, 0, NULL, pevent)); | |
if (ctx->debugging) { | |
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue)); | |
time_end_6935 = get_wall_time(); | |
long time_diff_6936 = time_end_6935 - time_start_6934; | |
fprintf(ctx->log, "kernel %s runtime: %ldus\n", "add.segmap_6892", time_diff_6936); | |
} | |
} | |
if (ctx->debugging) | |
fprintf(ctx->log, "%s\n", ""); | |
if (memblock_set_device(ctx, &mem_out_6922, &mem_6920, "mem_6920") != 0) | |
return 1; | |
if (memblock_set_device(ctx, &*mem_out_p_6932, &mem_out_6922, "mem_out_6922") != 0) | |
return 1; | |
cleanup: | |
{ | |
if (memblock_unref_device(ctx, &mem_6920, "mem_6920") != 0) | |
return 1; | |
if (memblock_unref_device(ctx, &mem_out_6922, "mem_out_6922") != 0) | |
return 1; | |
} | |
return err; | |
} | |
static int futrts_entry_add_i64(struct futhark_context *ctx, struct memblock_device *mem_out_p_6938, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6836) | |
{ | |
(void) ctx; | |
int err = 0; | |
struct memblock_device mem_6921; | |
mem_6921.references = NULL; | |
struct memblock_device mem_out_6922; | |
mem_out_6922.references = NULL; | |
int64_t binop_y_6919 = (int64_t) 8 * n_6836; | |
int64_t bytes_6920 = smax64((int64_t) 0, binop_y_6919); | |
int64_t segmap_group_sizze_6908; | |
segmap_group_sizze_6908 = *ctx->tuning_params.add_i64zisegmap_group_sizze_6899; | |
int64_t segmap_usable_groups_6909 = sdiv_up64(n_6836, segmap_group_sizze_6908); | |
if (memblock_alloc_device(ctx, &mem_6921, bytes_6920, "mem_6921")) { | |
err = 1; | |
goto cleanup; | |
} | |
if (ctx->debugging) | |
fprintf(ctx->log, "%s\n", "\n# SegMap"); | |
int32_t virt_num_groups_6923 = sext_i64_i32(sdiv_up64(n_6836, segmap_group_sizze_6908)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 1, sizeof(n_6836), &n_6836)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 2, sizeof(xs_mem_6916.mem), &xs_mem_6916.mem)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 3, sizeof(ys_mem_6917.mem), &ys_mem_6917.mem)); | |
OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 4, sizeof(mem_6921.mem), &mem_6921.mem)); | |
if (1 * ((size_t) segmap_usable_groups_6909 * (size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899) != 0) { | |
const size_t global_work_sizze_6939[1] = {(size_t) segmap_usable_groups_6909 * (size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899}; | |
const size_t local_work_sizze_6943[1] = {(size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899}; | |
int64_t time_start_6940 = 0, time_end_6941 = 0; | |
if (ctx->debugging) { | |
fprintf(ctx->log, "Launching %s with global work size [%zu] and local work size [%zu]; local memory: %d bytes.\n", "add_i64.segmap_6912", global_work_sizze_6939[0], local_work_sizze_6943[0], (int) 0); | |
time_start_6940 = get_wall_time(); | |
} | |
cl_event *pevent = ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->add_i64zisegmap_6912_runs, &ctx->program->add_i64zisegmap_6912_total_runtime); | |
OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->queue, ctx->program->add_i64zisegmap_6912, 1, NULL, global_work_sizze_6939, local_work_sizze_6943, 0, NULL, pevent)); | |
if (ctx->debugging) { | |
OPENCL_SUCCEED_FATAL(clFinish(ctx->queue)); | |
time_end_6941 = get_wall_time(); | |
long time_diff_6942 = time_end_6941 - time_start_6940; | |
fprintf(ctx->log, "kernel %s runtime: %ldus\n", "add_i64.segmap_6912", time_diff_6942); | |
} | |
} | |
if (ctx->debugging) | |
fprintf(ctx->log, "%s\n", ""); | |
if (memblock_set_device(ctx, &mem_out_6922, &mem_6921, "mem_6921") != 0) | |
return 1; | |
if (memblock_set_device(ctx, &*mem_out_p_6938, &mem_out_6922, "mem_out_6922") != 0) | |
return 1; | |
cleanup: | |
{ | |
if (memblock_unref_device(ctx, &mem_6921, "mem_6921") != 0) | |
return 1; | |
if (memblock_unref_device(ctx, &mem_out_6922, "mem_out_6922") != 0) | |
return 1; | |
} | |
return err; | |
} | |
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1) | |
{ | |
int64_t n_6776 = (int64_t) 0; | |
int ret = 0; | |
lock_lock(&ctx->lock); | |
struct memblock_device mem_out_6922; | |
mem_out_6922.references = NULL; | |
struct memblock_device ys_mem_6917; | |
ys_mem_6917.references = NULL; | |
struct memblock_device xs_mem_6916; | |
xs_mem_6916.references = NULL; | |
xs_mem_6916 = in0->mem; | |
n_6776 = in0->shape[0]; | |
ys_mem_6917 = in1->mem; | |
n_6776 = in1->shape[0]; | |
if (!(n_6776 == in0->shape[0] && n_6776 == in1->shape[0])) { | |
ret = 1; | |
set_error(ctx, msgprintf("Error: entry point arguments have invalid sizes.\n")); | |
} | |
if (ret == 0) { | |
ret = futrts_entry_add(ctx, &mem_out_6922, xs_mem_6916, ys_mem_6917, n_6776); | |
if (ret == 0) { | |
assert((*out0 = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d))) != NULL); | |
(*out0)->mem = mem_out_6922; | |
(*out0)->shape[0] = n_6776; | |
} | |
} | |
lock_unlock(&ctx->lock); | |
return ret; | |
} | |
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1) | |
{ | |
int64_t n_6836 = (int64_t) 0; | |
int ret = 0; | |
lock_lock(&ctx->lock); | |
struct memblock_device mem_out_6922; | |
mem_out_6922.references = NULL; | |
struct memblock_device ys_mem_6917; | |
ys_mem_6917.references = NULL; | |
struct memblock_device xs_mem_6916; | |
xs_mem_6916.references = NULL; | |
xs_mem_6916 = in0->mem; | |
n_6836 = in0->shape[0]; | |
ys_mem_6917 = in1->mem; | |
n_6836 = in1->shape[0]; | |
if (!(n_6836 == in0->shape[0] && n_6836 == in1->shape[0])) { | |
ret = 1; | |
set_error(ctx, msgprintf("Error: entry point arguments have invalid sizes.\n")); | |
} | |
if (ret == 0) { | |
ret = futrts_entry_add_i64(ctx, &mem_out_6922, xs_mem_6916, ys_mem_6917, n_6836); | |
if (ret == 0) { | |
assert((*out0 = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d))) != NULL); | |
(*out0)->mem = mem_out_6922; | |
(*out0)->shape[0] = n_6836; | |
} | |
} | |
lock_unlock(&ctx->lock); | |
return ret; | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule Map.NIF do
  @moduledoc """
  Elixir-side stubs for the native functions in `lib_map_nif`.

  `load_nifs/0` runs when the module is loaded and asks the runtime to load
  the native library. Each function below is a placeholder that raises if
  it is called without having been replaced by its native implementation.
  """

  @on_load :load_nifs

  @doc false
  def load_nifs do
    :erlang.load_nif(~c"./lib_map_nif", 0)
  end

  # Context management

  def futhark_context_config_new,
    do: raise("NIF futhark_context_config_new not implemented")

  def futhark_context_new(_cfg),
    do: raise("NIF futhark_context_new not implemented")

  def futhark_context_sync(_ctx),
    do: raise("NIF futhark_context_sync not implemented")

  # i64 arrays

  def futhark_new_i64_1d(_ctx, _binary),
    do: raise("NIF futhark_new_i64_1d not implemented")

  def futhark_i64_1d_to_binary(_ctx, _in),
    do: raise("NIF futhark_i64_1d_to_binary not implemented")

  # u8 arrays

  def futhark_new_u8_1d(_ctx, _binary),
    do: raise("NIF futhark_new_u8_1d not implemented")

  def futhark_u8_1d_to_binary(_ctx, _in),
    do: raise("NIF futhark_u8_1d_to_binary not implemented")

  # Entry points

  def futhark_entry_add(_ctx, _xs, _ys),
    do: raise("NIF futhark_entry_add not implemented")

  def futhark_entry_add_i64(_ctx, _xs, _ys),
    do: raise("NIF futhark_entry_add_i64 not implemented")
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs)
// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]
#pragma once

// Headers
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <float.h>

#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

// Initialisation
struct futhark_context_config;
struct futhark_context_config *futhark_context_config_new(void);
void futhark_context_config_free(struct futhark_context_config *cfg);
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg,
                                            const char *param_name,
                                            size_t new_value);

struct futhark_context;
struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
void futhark_context_free(struct futhark_context *ctx);

void futhark_context_config_add_build_option(struct futhark_context_config *cfg, const char *opt);
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
void futhark_context_config_list_devices(struct futhark_context_config *cfg);
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue queue);
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag);

int futhark_get_tuning_param_count(void);
const char *futhark_get_tuning_param_name(int i);
const char *futhark_get_tuning_param_class(int i);

// Arrays
//
// For each array type: new copies from a host buffer, new_raw copies from an
// existing cl_mem, values copies out to a host buffer, values_raw exposes the
// backing cl_mem, and shape returns the dimensions.
struct futhark_i64_1d;
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx,
                                          const int64_t *data, int64_t dim0);
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx,
                                              const cl_mem data, int64_t offset,
                                              int64_t dim0);
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr,
                          int64_t *data);
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);

struct futhark_u8_1d;
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx,
                                        const uint8_t *data, int64_t dim0);
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx,
                                            const cl_mem data, int64_t offset,
                                            int64_t dim0);
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr,
                         uint8_t *data);
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);

// Opaque values

// Entry points
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0,
                      const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1);
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0,
                          const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1);

// Miscellaneous
int futhark_context_sync(struct futhark_context *ctx);
cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f);
char *futhark_context_report(struct futhark_context *ctx);
char *futhark_context_get_error(struct futhark_context *ctx);
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
void futhark_context_pause_profiling(struct futhark_context *ctx);
void futhark_context_unpause_profiling(struct futhark_context *ctx);
int futhark_context_clear_caches(struct futhark_context *ctx);

#define FUTHARK_BACKEND_opencl
#define FUTHARK_SUCCESS 0
#define FUTHARK_PROGRAM_ERROR 2
#define FUTHARK_OUT_OF_MEMORY 3

#ifdef __cplusplus
}
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"backend": "opencl", | |
"entry_points": { | |
"add": { | |
"cfun": "futhark_entry_add", | |
"inputs": [ | |
{ | |
"name": "xs", | |
"type": "[]u8", | |
"unique": false | |
}, | |
{ | |
"name": "ys", | |
"type": "[]u8", | |
"unique": false | |
} | |
], | |
"outputs": [ | |
{ | |
"type": "[]u8", | |
"unique": false | |
} | |
], | |
"tuning_params": [ | |
"add.segmap_group_size_6879" | |
] | |
}, | |
"add_i64": { | |
"cfun": "futhark_entry_add_i64", | |
"inputs": [ | |
{ | |
"name": "xs", | |
"type": "[]i64", | |
"unique": false | |
}, | |
{ | |
"name": "ys", | |
"type": "[]i64", | |
"unique": false | |
} | |
], | |
"outputs": [ | |
{ | |
"type": "[]i64", | |
"unique": false | |
} | |
], | |
"tuning_params": [ | |
"add_i64.segmap_group_size_6899" | |
] | |
} | |
}, | |
"types": { | |
"[]i64": { | |
"ctype": "struct futhark_i64_1d *", | |
"elemtype": "i64", | |
"kind": "array", | |
"ops": { | |
"free": "futhark_free_i64_1d", | |
"new": "futhark_new_i64_1d", | |
"shape": "futhark_shape_i64_1d", | |
"values": "futhark_values_i64_1d" | |
}, | |
"rank": 1 | |
}, | |
"[]u8": { | |
"ctype": "struct futhark_u8_1d *", | |
"elemtype": "u8", | |
"kind": "array", | |
"ops": { | |
"free": "futhark_free_u8_1d", | |
"new": "futhark_new_u8_1d", | |
"shape": "futhark_shape_u8_1d", | |
"values": "futhark_values_u8_1d" | |
}, | |
"rank": 1 | |
} | |
}, | |
"version": "0.25.0 (prerelease - include info below when reporting bugs)\ngit: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]" | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <erl_nif.h> | |
#include "lib_map.c" | |
struct futhark_context; | |
ERL_NIF_TERM atom_ok; | |
ErlNifResourceType* CONFIG_TYPE; | |
ErlNifResourceType* CONTEXT_TYPE; | |
ErlNifResourceType* I64_1D; | |
ErlNifResourceType* U8_1D; | |
// Register (or take over) the resource type `name` and store its handle in
// *resource_type.
//
// Returns 0 on success and -1 if the runtime could not open the type.  The
// check is made on the handle that was just opened, not on CONFIG_TYPE, so
// a failure for any of the resource types is detected.
static int open_resource(ErlNifEnv* env, ErlNifResourceType** resource_type, const char* name)
{
  const char* mod = "resources";
  int flags = ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER;
  *resource_type = enif_open_resource_type(env, mod, name, NULL, flags, NULL);
  if (*resource_type == NULL) return -1;
  return 0;
}
// NIF load callback: registers the four resource types in order and creates
// the `ok` atom.  Returns -1 as soon as one registration fails, 0 otherwise.
static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info)
{
  const struct {
    ErlNifResourceType **slot;
    const char *name;
  } wanted[] = {
    { &CONFIG_TYPE,  "Config"  },
    { &CONTEXT_TYPE, "Context" },
    { &I64_1D,       "i64_1d"  },
    { &U8_1D,        "u8_1d"   },
  };
  (void) priv;
  (void) load_info;

  for (size_t i = 0; i < sizeof wanted / sizeof wanted[0]; i++) {
    if (open_resource(env, wanted[i].slot, wanted[i].name) == -1) {
      return -1;
    }
  }
  atom_ok = enif_make_atom(env, "ok");
  return 0;
}
// NIF: create a Futhark context configuration.
//
// Returns {ok, Config} where Config is a resource holding the configuration
// pointer.  Raises badarg if called with arguments, or if either the
// configuration or the resource cannot be allocated.
static ERL_NIF_TERM futhark_context_config_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  (void) argv;
  if (argc != 0) {
    return enif_make_badarg(env);
  }

  // Create the configuration first so a NULL is never wrapped in a resource.
  struct futhark_context_config *cfg = futhark_context_config_new();
  if (cfg == NULL) {
    return enif_make_badarg(env);
  }

  struct futhark_context_config **res = enif_alloc_resource(CONFIG_TYPE, sizeof *res);
  if (res == NULL) {
    futhark_context_config_free(cfg);
    return enif_make_badarg(env);
  }
  *res = cfg;

  ERL_NIF_TERM ret = enif_make_resource(env, res);
  // The term now keeps the resource alive; drop our own reference.
  enif_release_resource(res);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: create a Futhark context from a configuration resource.
//
// Returns {ok, Context}.  Raises badarg if the argument is not a
// configuration resource, if futhark_context_new() returns NULL, if the new
// context reports an error, or if the resource cannot be allocated.
static ERL_NIF_TERM futhark_context_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context_config **cfg;
  if (argc != 1) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONFIG_TYPE, (void**) &cfg)) {
    return enif_make_badarg(env);
  }

  struct futhark_context *fctx = futhark_context_new(*cfg);
  if (fctx == NULL) {
    return enif_make_badarg(env);
  }
  // futhark_context_new() can hand back a context whose backend setup
  // failed; the failure is only visible through the context's error string.
  char *setup_error = futhark_context_get_error(fctx);
  if (setup_error != NULL) {
    free(setup_error);
    // NOTE(review): the failed context is intentionally not passed to
    // futhark_context_free() here, because it is unclear from this file
    // whether teardown is safe after a failed setup - confirm and free it
    // if so.
    return enif_make_badarg(env);
  }

  struct futhark_context **res = enif_alloc_resource(CONTEXT_TYPE, sizeof *res);
  if (res == NULL) {
    futhark_context_free(fctx);
    return enif_make_badarg(env);
  }
  *res = fctx;

  ERL_NIF_TERM ret = enif_make_resource(env, res);
  enif_release_resource(res);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: block until the context's outstanding operations have finished.
//
// Returns the atom ok on success.  Raises badarg if the argument is not a
// context resource or if futhark_context_sync() reports a failure (the
// result used to be ignored).
static ERL_NIF_TERM futhark_context_sync_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  if (argc != 1) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (futhark_context_sync(*ctx) != 0) {
    return enif_make_badarg(env);
  }
  return atom_ok;
}
// NIF: create an i64 array from a binary of native-endian 64-bit integers.
//
// Arguments: (Context, Binary).  Returns {ok, Array}.
// Raises badarg if the arguments have the wrong type, if the binary size is
// not a multiple of 8 bytes, or if the array cannot be created.
static ERL_NIF_TERM futhark_new_i64_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  ErlNifBinary bin;
  if (argc != 2) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (!enif_inspect_binary(env, argv[1], &bin)) {
    return enif_make_badarg(env);
  }
  // Reject binaries with trailing partial elements instead of silently
  // dropping the extra bytes.
  if (bin.size % sizeof(int64_t) != 0) {
    return enif_make_badarg(env);
  }

  struct futhark_i64_1d *arr =
    futhark_new_i64_1d(*ctx, (const int64_t *) bin.data, (int64_t) (bin.size / sizeof(int64_t)));
  if (arr == NULL) {
    return enif_make_badarg(env);
  }
  // futhark_new_i64_1d() enqueues a non-blocking upload, and bin.data is
  // only valid during this call, so wait for the copy before returning.
  if (futhark_context_sync(*ctx) != 0) {
    futhark_free_i64_1d(*ctx, arr);
    return enif_make_badarg(env);
  }

  struct futhark_i64_1d **res = enif_alloc_resource(I64_1D, sizeof *res);
  if (res == NULL) {
    futhark_free_i64_1d(*ctx, arr);
    return enif_make_badarg(env);
  }
  *res = arr;

  ERL_NIF_TERM ret = enif_make_resource(env, res);
  enif_release_resource(res);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: copy an i64 array into a new binary (8 bytes per element).
//
// Arguments: (Context, Array).  Returns {ok, Binary}.
// Raises badarg if the arguments have the wrong type, if the binary cannot
// be allocated, or if the copy or the following sync fails.
static ERL_NIF_TERM futhark_i64_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  struct futhark_i64_1d **xs;
  ErlNifBinary binary;
  if (argc != 2) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }

  const int64_t *shape = futhark_shape_i64_1d(*ctx, *xs);
  if (!enif_alloc_binary((size_t) shape[0] * sizeof(int64_t), &binary)) {
    return enif_make_badarg(env);
  }
  // The read is enqueued asynchronously; the sync makes sure the binary is
  // fully written before it is handed to the VM.
  if (futhark_values_i64_1d(*ctx, *xs, (int64_t *) binary.data) != 0 ||
      futhark_context_sync(*ctx) != 0) {
    // Ownership was never transferred to a term, so release it ourselves.
    enif_release_binary(&binary);
    return enif_make_badarg(env);
  }

  ERL_NIF_TERM ret = enif_make_binary(env, &binary);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: create a u8 array from a binary, one element per byte.
//
// Arguments: (Context, Binary).  Returns {ok, Array}.
// Raises badarg if the arguments have the wrong type or if the array cannot
// be created.
static ERL_NIF_TERM futhark_new_u8_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  ErlNifBinary bin;
  if (argc != 2) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (!enif_inspect_binary(env, argv[1], &bin)) {
    return enif_make_badarg(env);
  }

  struct futhark_u8_1d *arr =
    futhark_new_u8_1d(*ctx, (const uint8_t *) bin.data, (int64_t) (bin.size / sizeof(uint8_t)));
  if (arr == NULL) {
    return enif_make_badarg(env);
  }
  // futhark_new_u8_1d() enqueues a non-blocking upload, and bin.data is
  // only valid during this call, so wait for the copy before returning.
  if (futhark_context_sync(*ctx) != 0) {
    futhark_free_u8_1d(*ctx, arr);
    return enif_make_badarg(env);
  }

  struct futhark_u8_1d **res = enif_alloc_resource(U8_1D, sizeof *res);
  if (res == NULL) {
    futhark_free_u8_1d(*ctx, arr);
    return enif_make_badarg(env);
  }
  *res = arr;

  ERL_NIF_TERM ret = enif_make_resource(env, res);
  enif_release_resource(res);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: copy a u8 array into a new binary, one byte per element.
//
// Arguments: (Context, Array).  Returns {ok, Binary}.
// Raises badarg if the arguments have the wrong type, if the binary cannot
// be allocated, or if the copy or the following sync fails.
static ERL_NIF_TERM futhark_u8_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  struct futhark_u8_1d **xs;
  ErlNifBinary binary;
  if (argc != 2) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }

  const int64_t *shape = futhark_shape_u8_1d(*ctx, *xs);
  if (!enif_alloc_binary((size_t) shape[0] * sizeof(uint8_t), &binary)) {
    return enif_make_badarg(env);
  }
  // The read is enqueued asynchronously; the sync makes sure the binary is
  // fully written before it is handed to the VM.
  if (futhark_values_u8_1d(*ctx, *xs, (uint8_t *) binary.data) != 0 ||
      futhark_context_sync(*ctx) != 0) {
    // Ownership was never transferred to a term, so release it ourselves.
    enif_release_binary(&binary);
    return enif_make_badarg(env);
  }

  ERL_NIF_TERM ret = enif_make_binary(env, &binary);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: run the `add` entry point on two u8 arrays.
//
// Arguments: (Context, Xs, Ys).  Returns {ok, Result}.
// Raises badarg if the arguments have the wrong type or if the entry point
// fails.  The entry point runs before the result resource is allocated, so
// a failure no longer leaves an unreleased, uninitialised resource behind.
static ERL_NIF_TERM futhark_entry_add_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  struct futhark_u8_1d **xs;
  struct futhark_u8_1d **ys;
  if (argc != 3) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[2], U8_1D, (void**) &ys)) {
    return enif_make_badarg(env);
  }

  struct futhark_u8_1d *out = NULL;
  if (futhark_entry_add(*ctx, &out, *xs, *ys) != 0) {
    return enif_make_badarg(env);
  }

  struct futhark_u8_1d **res = enif_alloc_resource(U8_1D, sizeof *res);
  if (res == NULL) {
    futhark_free_u8_1d(*ctx, out);
    return enif_make_badarg(env);
  }
  *res = out;

  ERL_NIF_TERM ret = enif_make_resource(env, res);
  enif_release_resource(res);
  return enif_make_tuple2(env, atom_ok, ret);
}
// NIF: run the `add_i64` entry point on two i64 arrays.
//
// Arguments: (Context, Xs, Ys).  Returns {ok, Result}.
// Raises badarg if the arguments have the wrong type or if the entry point
// fails.  The entry point runs before the result resource is allocated, so
// a failure no longer leaves an unreleased, uninitialised resource behind.
static ERL_NIF_TERM futhark_entry_add_i64_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  struct futhark_i64_1d **xs;
  struct futhark_i64_1d **ys;
  if (argc != 3) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }
  if (!enif_get_resource(env, argv[2], I64_1D, (void**) &ys)) {
    return enif_make_badarg(env);
  }

  struct futhark_i64_1d *out = NULL;
  if (futhark_entry_add_i64(*ctx, &out, *xs, *ys) != 0) {
    return enif_make_badarg(env);
  }

  struct futhark_i64_1d **res = enif_alloc_resource(I64_1D, sizeof *res);
  if (res == NULL) {
    futhark_free_i64_1d(*ctx, out);
    return enif_make_badarg(env);
  }
  *res = out;

  ERL_NIF_TERM ret = enif_make_resource(env, res);
  enif_release_resource(res);
  return enif_make_tuple2(env, atom_ok, ret);
}
static ErlNifFunc nif_funcs[] = { | |
{"futhark_context_config_new", 0, futhark_context_config_new_nif}, | |
{"futhark_context_new", 1, futhark_context_new_nif}, | |
{"futhark_new_i64_1d", 2, futhark_new_i64_1d_nif}, | |
{"futhark_i64_1d_to_binary", 2, futhark_i64_1d_to_binary_nif}, | |
{"futhark_new_u8_1d", 2, futhark_new_u8_1d_nif}, | |
{"futhark_u8_1d_to_binary", 2, futhark_u8_1d_to_binary_nif}, | |
{"futhark_entry_add", 3, futhark_entry_add_nif}, | |
{"futhark_entry_add_i64", 3, futhark_entry_add_i64_nif}, | |
{"futhark_context_sync", 1, futhark_context_sync_nif} | |
}; | |
ERL_NIF_INIT(Elixir.Map.NIF, nif_funcs, &load, NULL, NULL, NULL) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compile the wrapper module; loading it also loads the NIF library.
c("lib_map.ex")

# Every call below needs a context, which in turn needs a configuration.
{:ok, cfg} = Map.NIF.futhark_context_config_new()
{:ok, ctx} = Map.NIF.futhark_context_new(cfg)

# u8: round-trip a binary, then add two arrays element-wise.
u8_xs_bin = <<0, 1>>
{:ok, u8_xs} = Map.NIF.futhark_new_u8_1d(ctx, u8_xs_bin)
{:ok, ^u8_xs_bin} = Map.NIF.futhark_u8_1d_to_binary(ctx, u8_xs)

{:ok, u8_ys} = Map.NIF.futhark_new_u8_1d(ctx, <<1, 4>>)
{:ok, u8_zs} = Map.NIF.futhark_entry_add(ctx, u8_xs, u8_ys)
{:ok, <<1, 5>> = u8_zs_bin} = Map.NIF.futhark_u8_1d_to_binary(ctx, u8_zs)

# i64: same steps, with 64-bit little-endian signed integers.
i64_xs_bin = <<1::integer-signed-64-little>>
{:ok, i64_xs} = Map.NIF.futhark_new_i64_1d(ctx, i64_xs_bin)
{:ok, ^i64_xs_bin} = Map.NIF.futhark_i64_1d_to_binary(ctx, i64_xs)

{:ok, i64_ys} = Map.NIF.futhark_new_i64_1d(ctx, <<1279::integer-signed-64-little>>)
{:ok, i64_zs} = Map.NIF.futhark_entry_add_i64(ctx, i64_xs, i64_ys)
{:ok, <<1280::integer-signed-64-little>> = i64_zs_bin} = Map.NIF.futhark_i64_1d_to_binary(ctx, i64_zs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment