Munksgaard/lib_map.c

## lib_map.c
// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs)
// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]

// We need to define _GNU_SOURCE before
// _any_ headers files are imported to get
// the usage statistics of a thread (i.e. have RUSAGE_THREAD) on GNU/Linux
// https://manpages.courier-mta.org/htmlman2/getrusage.2.html
#ifndef _GNU_SOURCE // Avoid possible double-definition warning.
#define _GNU_SOURCE
#endif

#ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wparentheses"
#pragma clang diagnostic ignored "-Wunused-label"
#elif __GNUC__
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wparentheses"
#pragma GCC diagnostic ignored "-Wunused-label"
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#endif

// Headers
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <float.h>

#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

// Initialisation
struct futhark_context_config;
struct futhark_context_config *futhark_context_config_new(void);
void futhark_context_config_free(struct futhark_context_config *cfg);
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value);
struct futhark_context;
struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
void futhark_context_free(struct futhark_context *cfg);
void futhark_context_config_add_build_option(struct futhark_context_config *cfg, const char *opt);
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
void futhark_context_config_list_devices(struct futhark_context_config *cfg);
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue);
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag);
int futhark_get_tuning_param_count(void);
const char *futhark_get_tuning_param_name(int);
const char *futhark_get_tuning_param_class(int);

// Arrays
struct futhark_i64_1d;
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0);
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data);
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
struct futhark_u8_1d;
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0);
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data);
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);

// Opaque values


// Entry points
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1);
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1);

// Miscellaneous
int futhark_context_sync(struct futhark_context *ctx);
cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f);
char *futhark_context_report(struct futhark_context *ctx);
char *futhark_context_get_error(struct futhark_context *ctx);
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
void futhark_context_pause_profiling(struct futhark_context *ctx);
void futhark_context_unpause_profiling(struct futhark_context *ctx);
int futhark_context_clear_caches(struct futhark_context *ctx);
#define FUTHARK_BACKEND_opencl
#define FUTHARK_SUCCESS 0
#define FUTHARK_PROGRAM_ERROR 2
#define FUTHARK_OUT_OF_MEMORY 3

#ifdef __cplusplus
}
#endif

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <stdint.h>
// If NDEBUG is set, the assert() macro will do nothing. Since Futhark
// (unfortunately) makes use of assert() for error detection (and even some
// side effects), we want to avoid that.
#undef NDEBUG
#include <assert.h>
#include <stdarg.h>
// Start of util.h.
//
// Various helper functions that are useful in all generated C code.

#include <errno.h>
#include <string.h>

static const char *fut_progname = "(embedded Futhark)";

static void futhark_panic(int eval, const char *fmt, ...) __attribute__((noreturn));
static char* msgprintf(const char *s, ...);
static void* slurp_file(const char *filename, size_t *size);
static int dump_file(const char *file, const void *buf, size_t n);
struct str_builder;
static void str_builder_init(struct str_builder *b);
static void str_builder(struct str_builder *b, const char *s, ...);
static char *strclone(const char *str);

static void futhark_panic(int eval, const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  fprintf(stderr, "%s: ", fut_progname);
  vfprintf(stderr, fmt, ap);
  va_end(ap);
  exit(eval);
}

// For generating arbitrary-sized error messages.  It is the callers
// responsibility to free the buffer at some point.
static char* msgprintf(const char *s, ...) {
  va_list vl;
  va_start(vl, s);
  size_t needed = 1 + (size_t)vsnprintf(NULL, 0, s, vl);
  char *buffer = (char*) malloc(needed);
  va_start(vl, s); // Must re-init.
  vsnprintf(buffer, needed, s, vl);
  return buffer;
}

static inline void check_err(int errval, int sets_errno, const char *fun, int line,
                             const char *msg, ...) {
  if (errval) {
    char errnum[10];

    va_list vl;
    va_start(vl, msg);

    fprintf(stderr, "ERROR: ");
    vfprintf(stderr, msg, vl);
    fprintf(stderr, " in %s() at line %d with error code %s\n",
            fun, line,
            sets_errno ? strerror(errno) : errnum);
    exit(errval);
  }
}

#define CHECK_ERR(err, ...) check_err(err, 0, __func__, __LINE__, __VA_ARGS__)
#define CHECK_ERRNO(err, ...) check_err(err, 1, __func__, __LINE__, __VA_ARGS__)

// Read the rest of an open file into a NUL-terminated string; returns
// NULL on error.
static void* fslurp_file(FILE *f, size_t *size) {
  long start = ftell(f);
  fseek(f, 0, SEEK_END);
  long src_size = ftell(f)-start;
  fseek(f, start, SEEK_SET);
  unsigned char *s = (unsigned char*) malloc((size_t)src_size + 1);
  if (fread(s, 1, (size_t)src_size, f) != (size_t)src_size) {
    free(s);
    s = NULL;
  } else {
    s[src_size] = '\0';
  }

  if (size) {
    *size = (size_t)src_size;
  }

  return s;
}

// Read a file into a NUL-terminated string; returns NULL on error.
static void* slurp_file(const char *filename, size_t *size) {
  FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.
  if (f == NULL) return NULL;
  unsigned char *s = fslurp_file(f, size);
  fclose(f);
  return s;
}

// Dump 'n' bytes from 'buf' into the file at the designated location.
// Returns 0 on success.
static int dump_file(const char *file, const void *buf, size_t n) {
  FILE *f = fopen(file, "w");

  if (f == NULL) {
    return 1;
  }

  if (fwrite(buf, sizeof(char), n, f) != n) {
    return 1;
  }

  if (fclose(f) != 0) {
    return 1;
  }

  return 0;
}

struct str_builder {
  char *str;
  size_t capacity; // Size of buffer.
  size_t used; // Bytes used, *not* including final zero.
};

static void str_builder_init(struct str_builder *b) {
  b->capacity = 10;
  b->used = 0;
  b->str = malloc(b->capacity);
  b->str[0] = 0;
}

static void str_builder(struct str_builder *b, const char *s, ...) {
  va_list vl;
  va_start(vl, s);
  size_t needed = (size_t)vsnprintf(NULL, 0, s, vl);

  while (b->capacity < b->used + needed + 1) {
    b->capacity *= 2;
    b->str = realloc(b->str, b->capacity);
  }

  va_start(vl, s); // Must re-init.
  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
  b->used += needed;
}


static char *strclone(const char *str) {
  size_t size = strlen(str) + 1;
  char *copy = (char*) malloc(size);
  if (copy == NULL) {
    return NULL;
  }

  memcpy(copy, str, size);
  return copy;
}

// End of util.h.
// Start of cache.h

#define CACHE_HASH_SIZE 8 // In 32-bit words.

struct cache_hash {
  uint32_t hash[CACHE_HASH_SIZE];
};

// Initialise a blank cache.
static void cache_hash_init(struct cache_hash *c);

// Hash some bytes and add them to the accumulated hash.
static void cache_hash(struct cache_hash *out, const char *in, size_t n);

// Try to restore cache contents from a file with the given name.
// Assumes the cache is invalid if it contains the given hash.
// Allocates memory and reads the cache conents, which is returned in
// *buf with size *buflen.  If the cache is successfully loaded, this
// function returns 0.  Otherwise it returns nonzero.  Errno is set if
// the failure to load the cache is due to anything except invalid
// cache conents.  Note that failing to restore the cache is not
// necessarily a problem: it might just be invalid or not created yet.
static int cache_restore(const char *fname, const struct cache_hash *hash,
                         unsigned char **buf, size_t *buflen);

// Store cache contents in the given file, with the given hash.
static int cache_store(const char *fname, const struct cache_hash *hash,
                       const unsigned char *buf, size_t buflen);

// Now for the implementation.

static void cache_hash_init(struct cache_hash *c) {
  memset(c->hash, 0, CACHE_HASH_SIZE * sizeof(uint32_t));
}

static void cache_hash(struct cache_hash *out, const char *in, size_t n) {
  // Adaptation of djb2 for larger output size by storing intermediate
  // states.
  uint32_t hash = 5381;
  for (size_t i = 0; i < n; i++) {
    hash = ((hash << 5) + hash) + in[i];
    out->hash[i % CACHE_HASH_SIZE] ^= hash;
  }
}

#define CACHE_HEADER_SIZE 8
static const char cache_header[CACHE_HEADER_SIZE] = "FUTHARK\0";

static int cache_restore(const char *fname, const struct cache_hash *hash,
                         unsigned char **buf, size_t *buflen) {
  FILE *f = fopen(fname, "rb");

  if (f == NULL) {
    return 1;
  }

  char f_header[CACHE_HEADER_SIZE];

  if (fread(f_header, sizeof(char), CACHE_HEADER_SIZE, f) != CACHE_HEADER_SIZE) {
    goto error;
  }

  if (memcmp(f_header, cache_header, CACHE_HEADER_SIZE) != 0) {
    goto error;
  }

  if (fseek(f, 0, SEEK_END) != 0) {
    goto error;
  }
  int64_t f_size = (int64_t)ftell(f);
  if (fseek(f, CACHE_HEADER_SIZE, SEEK_SET) != 0) {
    goto error;
  }

  int64_t expected_size;

  if (fread(&expected_size, sizeof(int64_t), 1, f) != 1) {
    goto error;
  }

  if (f_size != expected_size) {
    errno = 0;
    goto error;
  }

  int32_t f_hash[CACHE_HASH_SIZE];

  if (fread(f_hash, sizeof(int32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
    goto error;
  }

  if (memcmp(f_hash, hash->hash, CACHE_HASH_SIZE) != 0) {
    errno = 0;
    goto error;
  }

  *buflen = f_size - CACHE_HEADER_SIZE - sizeof(int64_t) - CACHE_HASH_SIZE*sizeof(int32_t);
  *buf = malloc(*buflen);
  if (fread(*buf, sizeof(char), *buflen, f) != *buflen) {
    free(*buf);
    goto error;
  }

  fclose(f);

  return 0;

 error:
  fclose(f);
  return 1;
}

static int cache_store(const char *fname, const struct cache_hash *hash,
                       const unsigned char *buf, size_t buflen) {
  FILE *f = fopen(fname, "wb");

  if (f == NULL) {
    return 1;
  }

  if (fwrite(cache_header, CACHE_HEADER_SIZE, 1, f) != 1) {
    goto error;
  }

  int64_t size = CACHE_HEADER_SIZE + sizeof(int64_t) + CACHE_HASH_SIZE*sizeof(int32_t) + buflen;

  if (fwrite(&size, sizeof(size), 1, f) != 1) {
    goto error;
  }

  if (fwrite(hash->hash, sizeof(int32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
    goto error;
  }

  if (fwrite(buf, sizeof(unsigned char), buflen, f) != buflen) {
    goto error;
  }

  fclose(f);

  return 0;

 error:
  fclose(f);
  return 1;
}

// End of cache.h
// Start of half.h.

// Conversion functions are from http://half.sourceforge.net/, but
// translated to C.
//
// Copyright (c) 2012-2021 Christian Rau
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef __OPENCL_VERSION__
#define __constant
#endif

__constant static const uint16_t base_table[512] = {
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
  0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
  0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
  0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
  0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 };

__constant static const unsigned char shift_table[512] = {
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };

__constant static const uint32_t mantissa_table[2048] = {
  0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
  0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,
  0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,
  0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,
  0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,
  0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,
  0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,
  0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,
  0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,
  0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,
  0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,
  0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,
  0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,
  0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,
  0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,
  0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,
  0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,
  0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,
  0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,
  0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,
  0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,
  0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,
  0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,
  0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,
  0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,
  0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000,
  0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,
  0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,
  0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,
  0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,
  0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,
  0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,
  0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,
  0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,
  0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,
  0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,
  0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,
  0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,
  0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,
  0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,
  0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,
  0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,
  0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,
  0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,
  0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,
  0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,
  0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,
  0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,
  0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,
  0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,
  0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,
  0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,
  0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,
  0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,
  0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,
  0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,
  0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000,
  0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,
  0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,
  0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,
  0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,
  0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,
  0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,
  0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,
  0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,
  0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,
  0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,
  0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,
  0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,
  0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,
  0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,
  0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,
  0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,
  0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,
  0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,
  0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,
  0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,
  0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,
  0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,
  0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,
  0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,
  0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,
  0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,
  0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,
  0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,
  0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,
  0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,
  0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000,
  0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,
  0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,
  0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,
  0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,
  0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,
  0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,
  0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,
  0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,
  0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,
  0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,
  0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,
  0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,
  0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,
  0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,
  0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,
  0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,
  0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,
  0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,
  0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,
  0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,
  0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,
  0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,
  0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,
  0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,
  0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,
  0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,
  0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,
  0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,
  0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,
  0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,
  0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000,
  0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,
  0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,
  0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,
  0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,
  0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,
  0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,
  0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,
  0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,
  0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };
__constant static const uint32_t exponent_table[64] = {
  0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000,
  0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,
  0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
  0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };
__constant static const unsigned short offset_table[64] = {
  0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
  0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };

static uint16_t float2halfbits(float value) {
  union { float x; uint32_t y; } u;
  u.x = value;
  uint32_t bits = u.y;

  uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);;

  return hbits;
}

static float halfbits2float(uint16_t value) {
  uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];

  union { uint32_t x; float y; } u;
  u.x = bits;
  return u.y;
}

static uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {
  int fabs = from & 0x7FFF, tabs = to & 0x7FFF;
  if(fabs > 0x7C00 || tabs > 0x7C00) {
    return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200);
  }
  if(from == to || !(fabs|tabs)) {
    return to;
  }
  if(!fabs) {
    return (to&0x8000)+1;
  }
  unsigned int out =
    from +
    (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1)
    - 1;
  return out;
}

// End of half.h.
// Start of timing.h.

// The function get_wall_time() returns the wall time in microseconds
// (with an unspecified offset).

#ifdef _WIN32

#include <windows.h>

static int64_t get_wall_time(void) {
  LARGE_INTEGER time,freq;
  assert(QueryPerformanceFrequency(&freq));
  assert(QueryPerformanceCounter(&time));
  return ((double)time.QuadPart / freq.QuadPart) * 1000000;
}

#else
// Assuming POSIX

#include <time.h>
#include <sys/time.h>

static int64_t get_wall_time(void) {
  struct timeval time;
  assert(gettimeofday(&time,NULL) == 0);
  return time.tv_sec * 1000000 + time.tv_usec;
}

static int64_t get_wall_time_ns(void) {
  struct timespec time;
  assert(clock_gettime(CLOCK_REALTIME, &time) == 0);
  return time.tv_sec * 1000000000 + time.tv_nsec;
}

#endif

// End of timing.h.
// Start of lock.h.

// A very simple cross-platform implementation of locks.  Uses
// pthreads on Unix and some Windows thing there.  Futhark's
// host-level code is not multithreaded, but user code may be, so we
// need some mechanism for ensuring atomic access to API functions.
// This is that mechanism.  It is not exposed to user code at all, so
// we do not have to worry about name collisions.

#ifdef _WIN32

typedef HANDLE lock_t;

static void create_lock(lock_t *lock) {
  *lock = CreateMutex(NULL,  // Default security attributes.
                      FALSE, // Initially unlocked.
                      NULL); // Unnamed.
}

static void lock_lock(lock_t *lock) {
  assert(WaitForSingleObject(*lock, INFINITE) == WAIT_OBJECT_0);
}

static void lock_unlock(lock_t *lock) {
  assert(ReleaseMutex(*lock));
}

static void free_lock(lock_t *lock) {
  CloseHandle(*lock);
}

#else
// Assuming POSIX

#include <pthread.h>

typedef pthread_mutex_t lock_t;

static void create_lock(lock_t *lock) {
  int r = pthread_mutex_init(lock, NULL);
  assert(r == 0);
}

static void lock_lock(lock_t *lock) {
  int r = pthread_mutex_lock(lock);
  assert(r == 0);
}

static void lock_unlock(lock_t *lock) {
  int r = pthread_mutex_unlock(lock);
  assert(r == 0);
}

static void free_lock(lock_t *lock) {
  // Nothing to do for pthreads.
  (void)lock;
}

#endif

// End of lock.h.
// Start of free_list.h.

typedef uintptr_t fl_mem;

// An entry in the free list.  May be invalid, to avoid having to
// deallocate entries as soon as they are removed.  There is also a
// tag, to help with memory reuse.
struct free_list_entry {
  size_t size;
  fl_mem mem;
  const char *tag;
  unsigned char valid;
};

struct free_list {
  struct free_list_entry *entries; // Pointer to entries.
  int capacity;                    // Number of entries.
  int used;                        // Number of valid entries.
  lock_t lock;                     // Thread safety.
};

static void free_list_init(struct free_list *l) {
  l->capacity = 30; // Picked arbitrarily.
  l->used = 0;
  l->entries = (struct free_list_entry*) malloc(sizeof(struct free_list_entry) * l->capacity);
  for (int i = 0; i < l->capacity; i++) {
    l->entries[i].valid = 0;
  }
  create_lock(&l->lock);
}

// Remove invalid entries from the free list.
static void free_list_pack(struct free_list *l) {
  lock_lock(&l->lock);
  int p = 0;
  for (int i = 0; i < l->capacity; i++) {
    if (l->entries[i].valid) {
      l->entries[p] = l->entries[i];
      if (i > p) {
        l->entries[i].valid = 0;
      }
      p++;
    }
  }

  // Now p is the number of used elements.  We don't want it to go
  // less than the default capacity (although in practice it's OK as
  // long as it doesn't become 1).
  if (p < 30) {
    p = 30;
  }
  l->entries = realloc(l->entries, p * sizeof(struct free_list_entry));
  l->capacity = p;
  lock_unlock(&l->lock);
}

static void free_list_destroy(struct free_list *l) {
  assert(l->used == 0);
  free(l->entries);
  free_lock(&l->lock);
}

// Not part of the interface, so no locking.
static int free_list_find_invalid(struct free_list *l) {
  int i;
  for (i = 0; i < l->capacity; i++) {
    if (!l->entries[i].valid) {
      break;
    }
  }
  return i;
}

static void free_list_insert(struct free_list *l, size_t size, fl_mem mem, const char *tag) {
  lock_lock(&l->lock);
  int i = free_list_find_invalid(l);

  if (i == l->capacity) {
    // List is full; so we have to grow it.
    int new_capacity = l->capacity * 2 * sizeof(struct free_list_entry);
    l->entries = realloc(l->entries, new_capacity);
    for (int j = 0; j < l->capacity; j++) {
      l->entries[j+l->capacity].valid = 0;
    }
    l->capacity *= 2;
  }

  // Now 'i' points to the first invalid entry.
  l->entries[i].valid = 1;
  l->entries[i].size = size;
  l->entries[i].mem = mem;
  l->entries[i].tag = tag;

  l->used++;
  lock_unlock(&l->lock);
}

// Determine whether this entry in the free list is acceptable for
// satisfying the request.  Not public, so no locking.
static bool free_list_acceptable(size_t size, const char* tag, struct free_list_entry *entry) {
  // We check not just the hard requirement (is the entry acceptable
  // and big enough?) but also put a cap on how much wasted space
  // (internal fragmentation) we allow.  This is necessarily a
  // heuristic, and a crude one.

  if (!entry->valid) {
    return false;
  }

  if (size > entry->size) {
    return false;
  }

  // We know the block fits.  Now the question is whether it is too
  // big.  Our policy is as follows:
  //
  // 1) We don't care about wasted space below 4096 bytes (to avoid
  // churn in tiny allocations).
  //
  // 2) If the tag matches, we allow _any_ amount of wasted space.
  //
  // 3) Otherwise we allow up to 50% wasted space.

  if (entry->size < 4096) {
    return true;
  }

  if (entry->tag == tag) {
    return true;
  }

  if (entry->size < size * 2) {
    return true;
  }

  return false;
}

// Find and remove a memory block of the indicated tag, or if that
// does not exist, another memory block with exactly the desired size.
// Returns 0 on success.
static int free_list_find(struct free_list *l, size_t size, const char *tag,
                          size_t *size_out, fl_mem *mem_out) {
  lock_lock(&l->lock);
  int size_match = -1;
  int i;
  int ret = 1;
  for (i = 0; i < l->capacity; i++) {
    if (free_list_acceptable(size, tag, &l->entries[i]) &&
        (size_match < 0 || l->entries[i].size < l->entries[size_match].size)) {
      // If this entry is valid, has sufficient size, and is smaller than the
      // best entry found so far, use this entry.
      size_match = i;
    }
  }

  if (size_match >= 0) {
    l->entries[size_match].valid = 0;
    *size_out = l->entries[size_match].size;
    *mem_out = l->entries[size_match].mem;
    l->used--;
    ret = 0;
  }
  lock_unlock(&l->lock);
  return ret;
}

// Remove the first block in the free list.  Returns 0 if a block was
// removed, and nonzero if the free list was already empty.
static int free_list_first(struct free_list *l, fl_mem *mem_out) {
  lock_lock(&l->lock);
  int ret = 1;
  for (int i = 0; i < l->capacity; i++) {
    if (l->entries[i].valid) {
      l->entries[i].valid = 0;
      *mem_out = l->entries[i].mem;
      l->used--;
      ret = 0;
      break;
    }
  }
  lock_unlock(&l->lock);
  return ret;
}

// End of free_list.h.

#ifdef _MSC_VER
#define inline __inline
#endif
#include <string.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <ctype.h>


#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif


#define FUTHARK_F64_ENABLED

// Start of scalar.h.

// Implementation of the primitive scalar operations.  Very
// repetitive.  This code is inserted directly into both CUDA and
// OpenCL programs, as well as the CPU code, so it has some #ifdefs to
// work everywhere.  Some operations are defined as macros because
// this allows us to use them as constant expressions in things like
// array sizes and static initialisers.

// Some of the #ifdefs are because OpenCL uses type-generic functions
// for some operations (e.g. sqrt), while C and CUDA sensibly use
// distinct functions for different precisions (e.g. sqrtf() and
// sqrt()).  This is quite annoying.  Due to C's unfortunate casting
// rules, it is also really easy to accidentally implement
// floating-point functions in the wrong precision, so be careful.

// Double-precision definitions are only included if the preprocessor
// macro FUTHARK_F64_ENABLED is set.

static inline uint8_t add8(uint8_t x, uint8_t y) {
  return x + y;
}

static inline uint16_t add16(uint16_t x, uint16_t y) {
  return x + y;
}

static inline uint32_t add32(uint32_t x, uint32_t y) {
  return x + y;
}

static inline uint64_t add64(uint64_t x, uint64_t y) {
  return x + y;
}

static inline uint8_t sub8(uint8_t x, uint8_t y) {
  return x - y;
}

static inline uint16_t sub16(uint16_t x, uint16_t y) {
  return x - y;
}

static inline uint32_t sub32(uint32_t x, uint32_t y) {
  return x - y;
}

static inline uint64_t sub64(uint64_t x, uint64_t y) {
  return x - y;
}

static inline uint8_t mul8(uint8_t x, uint8_t y) {
  return x * y;
}

static inline uint16_t mul16(uint16_t x, uint16_t y) {
  return x * y;
}

static inline uint32_t mul32(uint32_t x, uint32_t y) {
  return x * y;
}

static inline uint64_t mul64(uint64_t x, uint64_t y) {
  return x * y;
}

#if ISPC

static inline uint8_t udiv8(uint8_t x, uint8_t y) {
  // This strange pattern is used to prevent the ISPC compiler from
  // causing SIGFPEs and bogus results on divisions where inactive lanes
  // have 0-valued divisors. It ensures that any inactive lane instead
  // has a divisor of 1. https://github.com/ispc/ispc/issues/2292
  uint8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x / ys;
}

static inline uint16_t udiv16(uint16_t x, uint16_t y) {
  uint16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x / ys;
}

static inline uint32_t udiv32(uint32_t x, uint32_t y) {
  uint32_t ys = 1;
  foreach_active(i){
    ys = y;
  }


  return x / ys;
}

static inline uint64_t udiv64(uint64_t x, uint64_t y) {
  uint64_t ys = 1;
  foreach_active(i){
    ys = y;
  }


  return x / ys;
}

static inline uint8_t udiv_up8(uint8_t x, uint8_t y) {
  uint8_t ys = 1;
  foreach_active(i){
    ys = y;
  }


  return (x + y - 1) / ys;
}

static inline uint16_t udiv_up16(uint16_t x, uint16_t y) {
  uint16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return (x + y - 1) / ys;
}

static inline uint32_t udiv_up32(uint32_t x, uint32_t y) {
  uint32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return (x + y - 1) / ys;
}

static inline uint64_t udiv_up64(uint64_t x, uint64_t y) {
  uint64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return (x + y - 1) / ys;
}

static inline uint8_t umod8(uint8_t x, uint8_t y) {
  uint8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline uint16_t umod16(uint16_t x, uint16_t y) {
  uint16_t ys = 1;
  foreach_active(i){
    ys = y;
  }


  return x % ys;
}

static inline uint32_t umod32(uint32_t x, uint32_t y) {
  uint32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline uint64_t umod64(uint64_t x, uint64_t y) {
  uint64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {
  uint8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {
  uint16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {
  uint32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {
  uint64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {
  uint8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : (x + y - 1) / ys;
}

static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {
  uint16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : (x + y - 1) / ys;
}

static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {
  uint32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : (x + y - 1) / ys;
}

static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {
  uint64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : (x + y - 1) / ys;
}

static inline uint8_t umod_safe8(uint8_t x, uint8_t y) {
  uint8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline uint16_t umod_safe16(uint16_t x, uint16_t y) {
  uint16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline uint32_t umod_safe32(uint32_t x, uint32_t y) {
  uint32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline uint64_t umod_safe64(uint64_t x, uint64_t y) {
  uint64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline int8_t sdiv8(int8_t x, int8_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int8_t q = x / ys;
  int8_t r = x % ys;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int16_t sdiv16(int16_t x, int16_t y) {
  int16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int16_t q = x / ys;
  int16_t r = x % ys;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int32_t sdiv32(int32_t x, int32_t y) {
  int32_t ys = 1;
  foreach_active(i){
    ys = y;
  }
  int32_t q = x / ys;
  int32_t r = x % ys;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int64_t sdiv64(int64_t x, int64_t y) {
  int64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int64_t q = x / ys;
  int64_t r = x % ys;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int8_t sdiv_up8(int8_t x, int8_t y) {
  return sdiv8(x + y - 1, y);
}

static inline int16_t sdiv_up16(int16_t x, int16_t y) {
  return sdiv16(x + y - 1, y);
}

static inline int32_t sdiv_up32(int32_t x, int32_t y) {
  return sdiv32(x + y - 1, y);
}

static inline int64_t sdiv_up64(int64_t x, int64_t y) {
  return sdiv64(x + y - 1, y);
}

static inline int8_t smod8(int8_t x, int8_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int8_t r = x % ys;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int16_t smod16(int16_t x, int16_t y) {
  int16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int16_t r = x % ys;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int32_t smod32(int32_t x, int32_t y) {
  int32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int32_t r = x % ys;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int64_t smod64(int64_t x, int64_t y) {
  int64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  int64_t r = x % ys;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int8_t sdiv_safe8(int8_t x, int8_t y) {
  return y == 0 ? 0 : sdiv8(x, y);
}

static inline int16_t sdiv_safe16(int16_t x, int16_t y) {
  return y == 0 ? 0 : sdiv16(x, y);
}

static inline int32_t sdiv_safe32(int32_t x, int32_t y) {
  return y == 0 ? 0 : sdiv32(x, y);
}

static inline int64_t sdiv_safe64(int64_t x, int64_t y) {
  return y == 0 ? 0 : sdiv64(x, y);
}

static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {
  return sdiv_safe8(x + y - 1, y);
}

static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {
  return sdiv_safe16(x + y - 1, y);
}

static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {
  return sdiv_safe32(x + y - 1, y);
}

static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {
  return sdiv_safe64(x + y - 1, y);
}

static inline int8_t smod_safe8(int8_t x, int8_t y) {
  return y == 0 ? 0 : smod8(x, y);
}

static inline int16_t smod_safe16(int16_t x, int16_t y) {
  return y == 0 ? 0 : smod16(x, y);
}

static inline int32_t smod_safe32(int32_t x, int32_t y) {
  return y == 0 ? 0 : smod32(x, y);
}

static inline int64_t smod_safe64(int64_t x, int64_t y) {
  return y == 0 ? 0 : smod64(x, y);
}

static inline int8_t squot8(int8_t x, int8_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x / ys;
}

static inline int16_t squot16(int16_t x, int16_t y) {
  int16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x / ys;
}

static inline int32_t squot32(int32_t x, int32_t y) {
  int32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x / ys;
}

static inline int64_t squot64(int64_t x, int64_t y) {
  int64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x / ys;
}

static inline int8_t srem8(int8_t x, int8_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline int16_t srem16(int16_t x, int16_t y) {
  int16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline int32_t srem32(int32_t x, int32_t y) {
  int32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline int64_t srem64(int64_t x, int64_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return x % ys;
}

static inline int8_t squot_safe8(int8_t x, int8_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline int16_t squot_safe16(int16_t x, int16_t y) {
  int16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline int32_t squot_safe32(int32_t x, int32_t y) {
  int32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline int64_t squot_safe64(int64_t x, int64_t y) {
  int64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x / ys;
}

static inline int8_t srem_safe8(int8_t x, int8_t y) {
  int8_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline int16_t srem_safe16(int16_t x, int16_t y) {
  int16_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline int32_t srem_safe32(int32_t x, int32_t y) {
  int32_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

static inline int64_t srem_safe64(int64_t x, int64_t y) {
  int64_t ys = 1;
  foreach_active(i){
    ys = y;
  }

  return y == 0 ? 0 : x % ys;
}

#else

static inline uint8_t udiv8(uint8_t x, uint8_t y) {
  return x / y;
}

static inline uint16_t udiv16(uint16_t x, uint16_t y) {
  return x / y;
}

static inline uint32_t udiv32(uint32_t x, uint32_t y) {
  return x / y;
}

static inline uint64_t udiv64(uint64_t x, uint64_t y) {
  return x / y;
}

static inline uint8_t udiv_up8(uint8_t x, uint8_t y) {
  return (x + y - 1) / y;
}

static inline uint16_t udiv_up16(uint16_t x, uint16_t y) {
  return (x + y - 1) / y;
}

static inline uint32_t udiv_up32(uint32_t x, uint32_t y) {
  return (x + y - 1) / y;
}

static inline uint64_t udiv_up64(uint64_t x, uint64_t y) {
  return (x + y - 1) / y;
}

static inline uint8_t umod8(uint8_t x, uint8_t y) {
  return x % y;
}

static inline uint16_t umod16(uint16_t x, uint16_t y) {
  return x % y;
}

static inline uint32_t umod32(uint32_t x, uint32_t y) {
  return x % y;
}

static inline uint64_t umod64(uint64_t x, uint64_t y) {
  return x % y;
}

static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {
  return y == 0 ? 0 : x / y;
}

static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {
  return y == 0 ? 0 : x / y;
}

static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {
  return y == 0 ? 0 : x / y;
}

static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {
  return y == 0 ? 0 : x / y;
}

static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {
  return y == 0 ? 0 : (x + y - 1) / y;
}

static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {
  return y == 0 ? 0 : (x + y - 1) / y;
}

static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {
  return y == 0 ? 0 : (x + y - 1) / y;
}

static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {
  return y == 0 ? 0 : (x + y - 1) / y;
}

static inline uint8_t umod_safe8(uint8_t x, uint8_t y) {
  return y == 0 ? 0 : x % y;
}

static inline uint16_t umod_safe16(uint16_t x, uint16_t y) {
  return y == 0 ? 0 : x % y;
}

static inline uint32_t umod_safe32(uint32_t x, uint32_t y) {
  return y == 0 ? 0 : x % y;
}

static inline uint64_t umod_safe64(uint64_t x, uint64_t y) {
  return y == 0 ? 0 : x % y;
}

static inline int8_t sdiv8(int8_t x, int8_t y) {
  int8_t q = x / y;
  int8_t r = x % y;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int16_t sdiv16(int16_t x, int16_t y) {
  int16_t q = x / y;
  int16_t r = x % y;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int32_t sdiv32(int32_t x, int32_t y) {
  int32_t q = x / y;
  int32_t r = x % y;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int64_t sdiv64(int64_t x, int64_t y) {
  int64_t q = x / y;
  int64_t r = x % y;

  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}

static inline int8_t sdiv_up8(int8_t x, int8_t y) {
  return sdiv8(x + y - 1, y);
}

static inline int16_t sdiv_up16(int16_t x, int16_t y) {
  return sdiv16(x + y - 1, y);
}

static inline int32_t sdiv_up32(int32_t x, int32_t y) {
  return sdiv32(x + y - 1, y);
}

static inline int64_t sdiv_up64(int64_t x, int64_t y) {
  return sdiv64(x + y - 1, y);
}

static inline int8_t smod8(int8_t x, int8_t y) {
  int8_t r = x % y;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int16_t smod16(int16_t x, int16_t y) {
  int16_t r = x % y;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int32_t smod32(int32_t x, int32_t y) {
  int32_t r = x % y;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int64_t smod64(int64_t x, int64_t y) {
  int64_t r = x % y;

  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}

static inline int8_t sdiv_safe8(int8_t x, int8_t y) {
  return y == 0 ? 0 : sdiv8(x, y);
}

static inline int16_t sdiv_safe16(int16_t x, int16_t y) {
  return y == 0 ? 0 : sdiv16(x, y);
}

static inline int32_t sdiv_safe32(int32_t x, int32_t y) {
  return y == 0 ? 0 : sdiv32(x, y);
}

static inline int64_t sdiv_safe64(int64_t x, int64_t y) {
  return y == 0 ? 0 : sdiv64(x, y);
}

static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {
  return sdiv_safe8(x + y - 1, y);
}

static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {
  return sdiv_safe16(x + y - 1, y);
}

static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {
  return sdiv_safe32(x + y - 1, y);
}

static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {
  return sdiv_safe64(x + y - 1, y);
}

static inline int8_t smod_safe8(int8_t x, int8_t y) {
  return y == 0 ? 0 : smod8(x, y);
}

static inline int16_t smod_safe16(int16_t x, int16_t y) {
  return y == 0 ? 0 : smod16(x, y);
}

static inline int32_t smod_safe32(int32_t x, int32_t y) {
  return y == 0 ? 0 : smod32(x, y);
}

static inline int64_t smod_safe64(int64_t x, int64_t y) {
  return y == 0 ? 0 : smod64(x, y);
}

static inline int8_t squot8(int8_t x, int8_t y) {
  return x / y;
}

static inline int16_t squot16(int16_t x, int16_t y) {
  return x / y;
}

static inline int32_t squot32(int32_t x, int32_t y) {
  return x / y;
}

static inline int64_t squot64(int64_t x, int64_t y) {
  return x / y;
}

static inline int8_t srem8(int8_t x, int8_t y) {
  return x % y;
}

static inline int16_t srem16(int16_t x, int16_t y) {
  return x % y;
}

static inline int32_t srem32(int32_t x, int32_t y) {
  return x % y;
}

static inline int64_t srem64(int64_t x, int64_t y) {
  return x % y;
}

static inline int8_t squot_safe8(int8_t x, int8_t y) {
  return y == 0 ? 0 : x / y;
}

static inline int16_t squot_safe16(int16_t x, int16_t y) {
  return y == 0 ? 0 : x / y;
}

static inline int32_t squot_safe32(int32_t x, int32_t y) {
  return y == 0 ? 0 : x / y;
}

static inline int64_t squot_safe64(int64_t x, int64_t y) {
  return y == 0 ? 0 : x / y;
}

static inline int8_t srem_safe8(int8_t x, int8_t y) {
  return y == 0 ? 0 : x % y;
}

static inline int16_t srem_safe16(int16_t x, int16_t y) {
  return y == 0 ? 0 : x % y;
}

static inline int32_t srem_safe32(int32_t x, int32_t y) {
  return y == 0 ? 0 : x % y;
}

static inline int64_t srem_safe64(int64_t x, int64_t y) {
  return y == 0 ? 0 : x % y;
}

#endif

static inline int8_t smin8(int8_t x, int8_t y) {
  return x < y ? x : y;
}

static inline int16_t smin16(int16_t x, int16_t y) {
  return x < y ? x : y;
}

static inline int32_t smin32(int32_t x, int32_t y) {
  return x < y ? x : y;
}

static inline int64_t smin64(int64_t x, int64_t y) {
  return x < y ? x : y;
}

static inline uint8_t umin8(uint8_t x, uint8_t y) {
  return x < y ? x : y;
}

static inline uint16_t umin16(uint16_t x, uint16_t y) {
  return x < y ? x : y;
}

static inline uint32_t umin32(uint32_t x, uint32_t y) {
  return x < y ? x : y;
}

static inline uint64_t umin64(uint64_t x, uint64_t y) {
  return x < y ? x : y;
}

static inline int8_t smax8(int8_t x, int8_t y) {
  return x < y ? y : x;
}

static inline int16_t smax16(int16_t x, int16_t y) {
  return x < y ? y : x;
}

static inline int32_t smax32(int32_t x, int32_t y) {
  return x < y ? y : x;
}

static inline int64_t smax64(int64_t x, int64_t y) {
  return x < y ? y : x;
}

static inline uint8_t umax8(uint8_t x, uint8_t y) {
  return x < y ? y : x;
}

static inline uint16_t umax16(uint16_t x, uint16_t y) {
  return x < y ? y : x;
}

static inline uint32_t umax32(uint32_t x, uint32_t y) {
  return x < y ? y : x;
}

static inline uint64_t umax64(uint64_t x, uint64_t y) {
  return x < y ? y : x;
}

static inline uint8_t shl8(uint8_t x, uint8_t y) {
  return (uint8_t)(x << y);
}

static inline uint16_t shl16(uint16_t x, uint16_t y) {
  return (uint16_t)(x << y);
}

static inline uint32_t shl32(uint32_t x, uint32_t y) {
  return x << y;
}

static inline uint64_t shl64(uint64_t x, uint64_t y) {
  return x << y;
}

static inline uint8_t lshr8(uint8_t x, uint8_t y) {
  return x >> y;
}

static inline uint16_t lshr16(uint16_t x, uint16_t y) {
  return x >> y;
}

static inline uint32_t lshr32(uint32_t x, uint32_t y) {
  return x >> y;
}

static inline uint64_t lshr64(uint64_t x, uint64_t y) {
  return x >> y;
}

static inline int8_t ashr8(int8_t x, int8_t y) {
  return x >> y;
}

static inline int16_t ashr16(int16_t x, int16_t y) {
  return x >> y;
}

static inline int32_t ashr32(int32_t x, int32_t y) {
  return x >> y;
}

static inline int64_t ashr64(int64_t x, int64_t y) {
  return x >> y;
}

static inline uint8_t and8(uint8_t x, uint8_t y) {
  return x & y;
}

static inline uint16_t and16(uint16_t x, uint16_t y) {
  return x & y;
}

static inline uint32_t and32(uint32_t x, uint32_t y) {
  return x & y;
}

static inline uint64_t and64(uint64_t x, uint64_t y) {
  return x & y;
}

static inline uint8_t or8(uint8_t x, uint8_t y) {
  return x | y;
}

static inline uint16_t or16(uint16_t x, uint16_t y) {
  return x | y;
}

static inline uint32_t or32(uint32_t x, uint32_t y) {
  return x | y;
}

static inline uint64_t or64(uint64_t x, uint64_t y) {
  return x | y;
}

static inline uint8_t xor8(uint8_t x, uint8_t y) {
  return x ^ y;
}

static inline uint16_t xor16(uint16_t x, uint16_t y) {
  return x ^ y;
}

static inline uint32_t xor32(uint32_t x, uint32_t y) {
  return x ^ y;
}

static inline uint64_t xor64(uint64_t x, uint64_t y) {
  return x ^ y;
}

static inline bool ult8(uint8_t x, uint8_t y) {
  return x < y;
}

static inline bool ult16(uint16_t x, uint16_t y) {
  return x < y;
}

static inline bool ult32(uint32_t x, uint32_t y) {
  return x < y;
}

static inline bool ult64(uint64_t x, uint64_t y) {
  return x < y;
}

static inline bool ule8(uint8_t x, uint8_t y) {
  return x <= y;
}

static inline bool ule16(uint16_t x, uint16_t y) {
  return x <= y;
}

static inline bool ule32(uint32_t x, uint32_t y) {
  return x <= y;
}

static inline bool ule64(uint64_t x, uint64_t y) {
  return x <= y;
}

static inline bool slt8(int8_t x, int8_t y) {
  return x < y;
}

static inline bool slt16(int16_t x, int16_t y) {
  return x < y;
}

static inline bool slt32(int32_t x, int32_t y) {
  return x < y;
}

static inline bool slt64(int64_t x, int64_t y) {
  return x < y;
}

static inline bool sle8(int8_t x, int8_t y) {
  return x <= y;
}

static inline bool sle16(int16_t x, int16_t y) {
  return x <= y;
}

static inline bool sle32(int32_t x, int32_t y) {
  return x <= y;
}

static inline bool sle64(int64_t x, int64_t y) {
  return x <= y;
}

static inline uint8_t pow8(uint8_t x, uint8_t y) {
  uint8_t res = 1, rem = y;

  while (rem != 0) {
    if (rem & 1)
      res *= x;
    rem >>= 1;
    x *= x;
  }
  return res;
}

static inline uint16_t pow16(uint16_t x, uint16_t y) {
  uint16_t res = 1, rem = y;

  while (rem != 0) {
    if (rem & 1)
      res *= x;
    rem >>= 1;
    x *= x;
  }
  return res;
}

static inline uint32_t pow32(uint32_t x, uint32_t y) {
  uint32_t res = 1, rem = y;

  while (rem != 0) {
    if (rem & 1)
      res *= x;
    rem >>= 1;
    x *= x;
  }
  return res;
}

static inline uint64_t pow64(uint64_t x, uint64_t y) {
  uint64_t res = 1, rem = y;

  while (rem != 0) {
    if (rem & 1)
      res *= x;
    rem >>= 1;
    x *= x;
  }
  return res;
}

static inline bool itob_i8_bool(int8_t x) {
  return x != 0;
}

static inline bool itob_i16_bool(int16_t x) {
  return x != 0;
}

static inline bool itob_i32_bool(int32_t x) {
  return x != 0;
}

static inline bool itob_i64_bool(int64_t x) {
  return x != 0;
}

static inline int8_t btoi_bool_i8(bool x) {
  return x;
}

static inline int16_t btoi_bool_i16(bool x) {
  return x;
}

static inline int32_t btoi_bool_i32(bool x) {
  return x;
}

static inline int64_t btoi_bool_i64(bool x) {
  return x;
}

#define sext_i8_i8(x) ((int8_t) (int8_t) (x))
#define sext_i8_i16(x) ((int16_t) (int8_t) (x))
#define sext_i8_i32(x) ((int32_t) (int8_t) (x))
#define sext_i8_i64(x) ((int64_t) (int8_t) (x))
#define sext_i16_i8(x) ((int8_t) (int16_t) (x))
#define sext_i16_i16(x) ((int16_t) (int16_t) (x))
#define sext_i16_i32(x) ((int32_t) (int16_t) (x))
#define sext_i16_i64(x) ((int64_t) (int16_t) (x))
#define sext_i32_i8(x) ((int8_t) (int32_t) (x))
#define sext_i32_i16(x) ((int16_t) (int32_t) (x))
#define sext_i32_i32(x) ((int32_t) (int32_t) (x))
#define sext_i32_i64(x) ((int64_t) (int32_t) (x))
#define sext_i64_i8(x) ((int8_t) (int64_t) (x))
#define sext_i64_i16(x) ((int16_t) (int64_t) (x))
#define sext_i64_i32(x) ((int32_t) (int64_t) (x))
#define sext_i64_i64(x) ((int64_t) (int64_t) (x))
#define zext_i8_i8(x) ((int8_t) (uint8_t) (x))
#define zext_i8_i16(x) ((int16_t) (uint8_t) (x))
#define zext_i8_i32(x) ((int32_t) (uint8_t) (x))
#define zext_i8_i64(x) ((int64_t) (uint8_t) (x))
#define zext_i16_i8(x) ((int8_t) (uint16_t) (x))
#define zext_i16_i16(x) ((int16_t) (uint16_t) (x))
#define zext_i16_i32(x) ((int32_t) (uint16_t) (x))
#define zext_i16_i64(x) ((int64_t) (uint16_t) (x))
#define zext_i32_i8(x) ((int8_t) (uint32_t) (x))
#define zext_i32_i16(x) ((int16_t) (uint32_t) (x))
#define zext_i32_i32(x) ((int32_t) (uint32_t) (x))
#define zext_i32_i64(x) ((int64_t) (uint32_t) (x))
#define zext_i64_i8(x) ((int8_t) (uint64_t) (x))
#define zext_i64_i16(x) ((int16_t) (uint64_t) (x))
#define zext_i64_i32(x) ((int32_t) (uint64_t) (x))
#define zext_i64_i64(x) ((int64_t) (uint64_t) (x))

static int8_t abs8(int8_t x) {
  return (int8_t)abs(x);
}

static int16_t abs16(int16_t x) {
  return (int16_t)abs(x);
}

static int32_t abs32(int32_t x) {
  return abs(x);
}

static int64_t abs64(int64_t x) {
#if defined(__OPENCL_VERSION__) || defined(ISPC)
  return abs(x);
#else
  return llabs(x);
#endif
}

#if defined(__OPENCL_VERSION__)
static int32_t futrts_popc8(int8_t x) {
  return popcount(x);
}

static int32_t futrts_popc16(int16_t x) {
  return popcount(x);
}

static int32_t futrts_popc32(int32_t x) {
  return popcount(x);
}

static int32_t futrts_popc64(int64_t x) {
  return popcount(x);
}
#elif defined(__CUDA_ARCH__)

static int32_t futrts_popc8(int8_t x) {
  return __popc(zext_i8_i32(x));
}

static int32_t futrts_popc16(int16_t x) {
  return __popc(zext_i16_i32(x));
}

static int32_t futrts_popc32(int32_t x) {
  return __popc(x);
}

static int32_t futrts_popc64(int64_t x) {
  return __popcll(x);
}

#else // Not OpenCL or CUDA, but plain C.

static int32_t futrts_popc8(uint8_t x) {
  int c = 0;
  for (; x; ++c) { x &= x - 1; }
  return c;
}

static int32_t futrts_popc16(uint16_t x) {
  int c = 0;
  for (; x; ++c) { x &= x - 1; }
  return c;
}

static int32_t futrts_popc32(uint32_t x) {
  int c = 0;
  for (; x; ++c) { x &= x - 1; }
  return c;
}

static int32_t futrts_popc64(uint64_t x) {
  int c = 0;
  for (; x; ++c) { x &= x - 1; }
  return c;
}
#endif

#if defined(__OPENCL_VERSION__)
static uint8_t  futrts_umul_hi8 ( uint8_t a,  uint8_t b) { return mul_hi(a, b); }
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); }
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); }
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); }
static uint8_t  futrts_smul_hi8 ( int8_t a,  int8_t b) { return mul_hi(a, b); }
static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); }
static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); }
static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); }
#elif defined(__CUDA_ARCH__)
static  uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); }
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); }
static  uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }
static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }
static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }
#elif ISPC
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {
  uint64_t ah = a >> 32;
  uint64_t al = a & 0xffffffff;
  uint64_t bh = b >> 32;
  uint64_t bl = b & 0xffffffff;

  uint64_t p1 = al * bl;
  uint64_t p2 = al * bh;
  uint64_t p3 = ah * bl;
  uint64_t p4 = ah * bh;

  uint64_t p1h = p1 >> 32;
  uint64_t p2h = p2 >> 32;
  uint64_t p3h = p3 >> 32;
  uint64_t p2l = p2 & 0xffffffff;
  uint64_t p3l = p3 & 0xffffffff;

  uint64_t l = p1h + p2l + p3l;
  uint64_t m = (p2 >> 32) + (p3 >> 32);
  uint64_t h = (l >> 32) + m + p4;

  return h;
}
static  int8_t futrts_smul_hi8 ( int8_t a,  int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
static int64_t futrts_smul_hi64(int64_t a, int64_t b) {
  uint64_t ah = a >> 32;
  uint64_t al = a & 0xffffffff;
  uint64_t bh = b >> 32;
  uint64_t bl = b & 0xffffffff;

  uint64_t p1 =  al * bl;
  int64_t  p2 = al * bh;
  int64_t  p3 = ah * bl;
  uint64_t p4 =  ah * bh;

  uint64_t p1h = p1 >> 32;
  uint64_t p2h = p2 >> 32;
  uint64_t p3h = p3 >> 32;
  uint64_t p2l = p2 & 0xffffffff;
  uint64_t p3l = p3 & 0xffffffff;

  uint64_t l = p1h + p2l + p3l;
  uint64_t m = (p2 >> 32) + (p3 >> 32);
  uint64_t h = (l >> 32) + m + p4;

  return h;
}

#else // Not OpenCL, ISPC, or CUDA, but plain C.
static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; }
static int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }
static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; }
static int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; }
#endif

#if defined(__OPENCL_VERSION__)
static  uint8_t futrts_umad_hi8 ( uint8_t a,  uint8_t b,  uint8_t c) { return mad_hi(a, b, c); }
static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); }
static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); }
static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); }
static  uint8_t futrts_smad_hi8( int8_t a,  int8_t b,   int8_t c) { return mad_hi(a, b, c); }
static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); }
static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); }
static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); }
#else // Not OpenCL

static  uint8_t futrts_umad_hi8( uint8_t a,  uint8_t b,  uint8_t c) { return futrts_umul_hi8(a, b) + c; }
static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; }
static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; }
static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; }
static  uint8_t futrts_smad_hi8 ( int8_t a,  int8_t b,  int8_t c) { return futrts_smul_hi8(a, b) + c; }
static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; }
static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; }
static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; }
#endif

#if defined(__OPENCL_VERSION__)
static int32_t futrts_clzz8(int8_t x) {
  return clz(x);
}

static int32_t futrts_clzz16(int16_t x) {
  return clz(x);
}

static int32_t futrts_clzz32(int32_t x) {
  return clz(x);
}

static int32_t futrts_clzz64(int64_t x) {
  return clz(x);
}

#elif defined(__CUDA_ARCH__)

static int32_t futrts_clzz8(int8_t x) {
  return __clz(zext_i8_i32(x)) - 24;
}

static int32_t futrts_clzz16(int16_t x) {
  return __clz(zext_i16_i32(x)) - 16;
}

static int32_t futrts_clzz32(int32_t x) {
  return __clz(x);
}

static int32_t futrts_clzz64(int64_t x) {
  return __clzll(x);
}

#elif ISPC

static int32_t futrts_clzz8(int8_t x) {
  return count_leading_zeros((int32_t)(uint8_t)x)-24;
}

static int32_t futrts_clzz16(int16_t x) {
  return count_leading_zeros((int32_t)(uint16_t)x)-16;
}

static int32_t futrts_clzz32(int32_t x) {
  return count_leading_zeros(x);
}

static int32_t futrts_clzz64(int64_t x) {
  return count_leading_zeros(x);
}

#else // Not OpenCL, ISPC or CUDA, but plain C.

static int32_t futrts_clzz8(int8_t x) {
  return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24;
}

static int32_t futrts_clzz16(int16_t x) {
  return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16;
}

static int32_t futrts_clzz32(int32_t x) {
  return x == 0 ? 32 : __builtin_clz((uint32_t)x);
}

static int32_t futrts_clzz64(int64_t x) {
  return x == 0 ? 64 : __builtin_clzll((uint64_t)x);
}
#endif

#if defined(__OPENCL_VERSION__)
static int32_t futrts_ctzz8(int8_t x) {
  int i = 0;
  for (; i < 8 && (x & 1) == 0; i++, x >>= 1)
    ;
  return i;
}

static int32_t futrts_ctzz16(int16_t x) {
  int i = 0;
  for (; i < 16 && (x & 1) == 0; i++, x >>= 1)
    ;
  return i;
}

static int32_t futrts_ctzz32(int32_t x) {
  int i = 0;
  for (; i < 32 && (x & 1) == 0; i++, x >>= 1)
    ;
  return i;
}

static int32_t futrts_ctzz64(int64_t x) {
  int i = 0;
  for (; i < 64 && (x & 1) == 0; i++, x >>= 1)
    ;
  return i;
}

#elif defined(__CUDA_ARCH__)

static int32_t futrts_ctzz8(int8_t x) {
  int y = __ffs(x);
  return y == 0 ? 8 : y - 1;
}

static int32_t futrts_ctzz16(int16_t x) {
  int y = __ffs(x);
  return y == 0 ? 16 : y - 1;
}

static int32_t futrts_ctzz32(int32_t x) {
  int y = __ffs(x);
  return y == 0 ? 32 : y - 1;
}

static int32_t futrts_ctzz64(int64_t x) {
  int y = __ffsll(x);
  return y == 0 ? 64 : y - 1;
}

#elif ISPC

static int32_t futrts_ctzz8(int8_t x) {
  return x == 0 ? 8 : count_trailing_zeros((int32_t)x);
}

static int32_t futrts_ctzz16(int16_t x) {
  return x == 0 ? 16 : count_trailing_zeros((int32_t)x);
}

static int32_t futrts_ctzz32(int32_t x) {
  return count_trailing_zeros(x);
}

static int32_t futrts_ctzz64(int64_t x) {
  return count_trailing_zeros(x);
}

#else // Not OpenCL or CUDA, but plain C.

static int32_t futrts_ctzz8(int8_t x) {
  return x == 0 ? 8 : __builtin_ctz((uint32_t)x);
}

static int32_t futrts_ctzz16(int16_t x) {
  return x == 0 ? 16 : __builtin_ctz((uint32_t)x);
}

static int32_t futrts_ctzz32(int32_t x) {
  return x == 0 ? 32 : __builtin_ctz((uint32_t)x);
}

static int32_t futrts_ctzz64(int64_t x) {
  return x == 0 ? 64 : __builtin_ctzll((uint64_t)x);
}
#endif

static inline float fdiv32(float x, float y) {
  return x / y;
}

static inline float fadd32(float x, float y) {
  return x + y;
}

static inline float fsub32(float x, float y) {
  return x - y;
}

static inline float fmul32(float x, float y) {
  return x * y;
}

static inline bool cmplt32(float x, float y) {
  return x < y;
}

static inline bool cmple32(float x, float y) {
  return x <= y;
}

static inline float sitofp_i8_f32(int8_t x) {
  return (float) x;
}

static inline float sitofp_i16_f32(int16_t x) {
  return (float) x;
}

static inline float sitofp_i32_f32(int32_t x) {
  return (float) x;
}

static inline float sitofp_i64_f32(int64_t x) {
  return (float) x;
}

static inline float uitofp_i8_f32(uint8_t x) {
  return (float) x;
}

static inline float uitofp_i16_f32(uint16_t x) {
  return (float) x;
}

static inline float uitofp_i32_f32(uint32_t x) {
  return (float) x;
}

static inline float uitofp_i64_f32(uint64_t x) {
  return (float) x;
}

#ifdef __OPENCL_VERSION__
static inline float fabs32(float x) {
  return fabs(x);
}

static inline float fmax32(float x, float y) {
  return fmax(x, y);
}

static inline float fmin32(float x, float y) {
  return fmin(x, y);
}

static inline float fpow32(float x, float y) {
  return pow(x, y);
}

#elif ISPC

static inline float fabs32(float x) {
  return abs(x);
}

static inline float fmax32(float x, float y) {
  return isnan(x) ? y : isnan(y) ? x : max(x, y);
}

static inline float fmin32(float x, float y) {
  return isnan(x) ? y : isnan(y) ? x : min(x, y);
}

static inline float fpow32(float a, float b) {
  float ret;
  foreach_active (i) {
      uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
      ret = insert(ret, i, r);
  }
  return ret;
}

#else // Not OpenCL, but CUDA or plain C.

static inline float fabs32(float x) {
  return fabsf(x);
}

static inline float fmax32(float x, float y) {
  return fmaxf(x, y);
}

static inline float fmin32(float x, float y) {
  return fminf(x, y);
}

static inline float fpow32(float x, float y) {
  return powf(x, y);
}
#endif

static inline bool futrts_isnan32(float x) {
  return isnan(x);
}

#if ISPC

static inline bool futrts_isinf32(float x) {
  return !isnan(x) && isnan(x - x);
}

static inline bool futrts_isfinite32(float x) {
  return !isnan(x) && !futrts_isinf32(x);
}

#else

static inline bool futrts_isinf32(float x) {
  return isinf(x);
}

#endif

static inline int8_t fptosi_f32_i8(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (int8_t) x;
  }
}

static inline int16_t fptosi_f32_i16(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (int16_t) x;
  }
}

static inline int32_t fptosi_f32_i32(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (int32_t) x;
  }
}

static inline int64_t fptosi_f32_i64(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (int64_t) x;
  };
}

static inline uint8_t fptoui_f32_i8(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (uint8_t) (int8_t) x;
  }
}

static inline uint16_t fptoui_f32_i16(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (uint16_t) (int16_t) x;
  }
}

static inline uint32_t fptoui_f32_i32(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (uint32_t) (int32_t) x;
  }
}

static inline uint64_t fptoui_f32_i64(float x) {
  if (futrts_isnan32(x) || futrts_isinf32(x)) {
    return 0;
  } else {
    return (uint64_t) (int64_t) x;
  }
}

static inline bool ftob_f32_bool(float x) {
  return x != 0;
}

static inline float btof_bool_f32(bool x) {
  return x ? 1 : 0;
}

#ifdef __OPENCL_VERSION__
static inline float futrts_log32(float x) {
  return log(x);
}

static inline float futrts_log2_32(float x) {
  return log2(x);
}

static inline float futrts_log10_32(float x) {
  return log10(x);
}

static inline float futrts_log1p_32(float x) {
  return log1p(x);
}

static inline float futrts_sqrt32(float x) {
  return sqrt(x);
}

static inline float futrts_cbrt32(float x) {
  return cbrt(x);
}

static inline float futrts_exp32(float x) {
  return exp(x);
}

static inline float futrts_cos32(float x) {
  return cos(x);
}

static inline float futrts_sin32(float x) {
  return sin(x);
}

static inline float futrts_tan32(float x) {
  return tan(x);
}

static inline float futrts_acos32(float x) {
  return acos(x);
}

static inline float futrts_asin32(float x) {
  return asin(x);
}

static inline float futrts_atan32(float x) {
  return atan(x);
}

static inline float futrts_cosh32(float x) {
  return cosh(x);
}

static inline float futrts_sinh32(float x) {
  return sinh(x);
}

static inline float futrts_tanh32(float x) {
  return tanh(x);
}

static inline float futrts_acosh32(float x) {
  return acosh(x);
}

static inline float futrts_asinh32(float x) {
  return asinh(x);
}

static inline float futrts_atanh32(float x) {
  return atanh(x);
}

static inline float futrts_atan2_32(float x, float y) {
  return atan2(x, y);
}

static inline float futrts_hypot32(float x, float y) {
  return hypot(x, y);
}

static inline float futrts_gamma32(float x) {
  return tgamma(x);
}

static inline float futrts_lgamma32(float x) {
  return lgamma(x);
}

static inline float futrts_erf32(float x) {
  return erf(x);
}

static inline float futrts_erfc32(float x) {
  return erfc(x);
}

static inline float fmod32(float x, float y) {
  return fmod(x, y);
}

static inline float futrts_round32(float x) {
  return rint(x);
}

static inline float futrts_floor32(float x) {
  return floor(x);
}

static inline float futrts_ceil32(float x) {
  return ceil(x);
}

static inline float futrts_nextafter32(float x, float y) {
  return nextafter(x, y);
}

static inline float futrts_lerp32(float v0, float v1, float t) {
  return mix(v0, v1, t);
}

static inline float futrts_mad32(float a, float b, float c) {
  return mad(a, b, c);
}

static inline float futrts_fma32(float a, float b, float c) {
  return fma(a, b, c);
}

#elif ISPC

static inline float futrts_log32(float x) {
  return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x;
}

static inline float futrts_log2_32(float x) {
  return futrts_log32(x) / log(2.0f);
}

static inline float futrts_log10_32(float x) {
  return futrts_log32(x) / log(10.0f);
}

static inline float futrts_log1p_32(float x) {
  if(x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f;
  float y = 1.0f + x;
  float z = y - 1.0f;
  return log(y) - (z-x)/y;
}

static inline float futrts_sqrt32(float x) {
  return sqrt(x);
}

extern "C" unmasked uniform float cbrtf(uniform float);
static inline float futrts_cbrt32(float x) {
  float res;
  foreach_active (i) {
    uniform float r = cbrtf(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline float futrts_exp32(float x) {
  return exp(x);
}

static inline float futrts_cos32(float x) {
  return cos(x);
}

static inline float futrts_sin32(float x) {
  return sin(x);
}

static inline float futrts_tan32(float x) {
  return tan(x);
}

static inline float futrts_acos32(float x) {
  return acos(x);
}

static inline float futrts_asin32(float x) {
  return asin(x);
}

static inline float futrts_atan32(float x) {
  return atan(x);
}

static inline float futrts_cosh32(float x) {
  return (exp(x)+exp(-x)) / 2.0f;
}

static inline float futrts_sinh32(float x) {
  return (exp(x)-exp(-x)) / 2.0f;
}

static inline float futrts_tanh32(float x) {
  return futrts_sinh32(x)/futrts_cosh32(x);
}

static inline float futrts_acosh32(float x) {
  float f = x+sqrt(x*x-1);
  if(futrts_isfinite32(f)) return log(f);
  return f;
}

static inline float futrts_asinh32(float x) {
  float f = x+sqrt(x*x+1);
  if(futrts_isfinite32(f)) return log(f);
  return f;

}

static inline float futrts_atanh32(float x) {
  float f = (1+x)/(1-x);
  if(futrts_isfinite32(f)) return log(f)/2.0f;
  return f;

}

static inline float futrts_atan2_32(float x, float y) {
  return (x == 0.0f && y == 0.0f) ? 0.0f : atan2(x, y);
}

static inline float futrts_hypot32(float x, float y) {
  if (futrts_isfinite32(x) && futrts_isfinite32(y)) {
    x = abs(x);
    y = abs(y);
    float a;
    float b;
    if (x >= y){
        a = x;
        b = y;
    } else {
        a = y;
        b = x;
    }
    if(b == 0){
      return a;
    }

    int e;
    float an;
    float bn;
    an = frexp (a, &e);
    bn = ldexp (b, - e);
    float cn;
    cn = sqrt (an * an + bn * bn);
    return ldexp (cn, e);
  } else {
    if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY;
    else return x + y;
  }

}

extern "C" unmasked uniform float tgammaf(uniform float x);
static inline float futrts_gamma32(float x) {
  float res;
  foreach_active (i) {
    uniform float r = tgammaf(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform float lgammaf(uniform float x);
static inline float futrts_lgamma32(float x) {
  float res;
  foreach_active (i) {
    uniform float r = lgammaf(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform float erff(uniform float x);
static inline float futrts_erf32(float x) {
  float res;
  foreach_active (i) {
    uniform float r = erff(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform float erfcf(uniform float x);
static inline float futrts_erfc32(float x) {
  float res;
  foreach_active (i) {
    uniform float r = erfcf(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline float fmod32(float x, float y) {
  return x - y * trunc(x/y);
}

static inline float futrts_round32(float x) {
  return round(x);
}

static inline float futrts_floor32(float x) {
  return floor(x);
}

static inline float futrts_ceil32(float x) {
  return ceil(x);
}

extern "C" unmasked uniform float nextafterf(uniform float x, uniform float y);
static inline float futrts_nextafter32(float x, float y) {
  float res;
  foreach_active (i) {
    uniform float r = nextafterf(extract(x, i), extract(y, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline float futrts_lerp32(float v0, float v1, float t) {
  return v0 + (v1 - v0) * t;
}

static inline float futrts_mad32(float a, float b, float c) {
  return a * b + c;
}

static inline float futrts_fma32(float a, float b, float c) {
  return a * b + c;
}

#else // Not OpenCL or ISPC, but CUDA or plain C.

static inline float futrts_log32(float x) {
  return logf(x);
}

static inline float futrts_log2_32(float x) {
  return log2f(x);
}

static inline float futrts_log10_32(float x) {
  return log10f(x);
}

static inline float futrts_log1p_32(float x) {
  return log1pf(x);
}

static inline float futrts_sqrt32(float x) {
  return sqrtf(x);
}

static inline float futrts_cbrt32(float x) {
  return cbrtf(x);
}

static inline float futrts_exp32(float x) {
  return expf(x);
}

static inline float futrts_cos32(float x) {
  return cosf(x);
}

static inline float futrts_sin32(float x) {
  return sinf(x);
}

static inline float futrts_tan32(float x) {
  return tanf(x);
}

static inline float futrts_acos32(float x) {
  return acosf(x);
}

static inline float futrts_asin32(float x) {
  return asinf(x);
}

static inline float futrts_atan32(float x) {
  return atanf(x);
}

static inline float futrts_cosh32(float x) {
  return coshf(x);
}

static inline float futrts_sinh32(float x) {
  return sinhf(x);
}

static inline float futrts_tanh32(float x) {
  return tanhf(x);
}

static inline float futrts_acosh32(float x) {
  return acoshf(x);
}

static inline float futrts_asinh32(float x) {
  return asinhf(x);
}

static inline float futrts_atanh32(float x) {
  return atanhf(x);
}

static inline float futrts_atan2_32(float x, float y) {
  return atan2f(x, y);
}

static inline float futrts_hypot32(float x, float y) {
  return hypotf(x, y);
}

static inline float futrts_gamma32(float x) {
  return tgammaf(x);
}

static inline float futrts_lgamma32(float x) {
  return lgammaf(x);
}

static inline float futrts_erf32(float x) {
  return erff(x);
}

static inline float futrts_erfc32(float x) {
  return erfcf(x);
}

static inline float fmod32(float x, float y) {
  return fmodf(x, y);
}

static inline float futrts_round32(float x) {
  return rintf(x);
}

static inline float futrts_floor32(float x) {
  return floorf(x);
}

static inline float futrts_ceil32(float x) {
  return ceilf(x);
}

static inline float futrts_nextafter32(float x, float y) {
  return nextafterf(x, y);
}

static inline float futrts_lerp32(float v0, float v1, float t) {
  return v0 + (v1 - v0) * t;
}

static inline float futrts_mad32(float a, float b, float c) {
  return a * b + c;
}

static inline float futrts_fma32(float a, float b, float c) {
  return fmaf(a, b, c);
}
#endif

#if ISPC
static inline int32_t futrts_to_bits32(float x) {
  return intbits(x);
}

static inline float futrts_from_bits32(int32_t x) {
  return floatbits(x);
}
#else
static inline int32_t futrts_to_bits32(float x) {
  union {
    float f;
    int32_t t;
  } p;

  p.f = x;
  return p.t;
}

static inline float futrts_from_bits32(int32_t x) {
  union {
    int32_t f;
    float t;
  } p;

  p.f = x;
  return p.t;
}
#endif

static inline float fsignum32(float x) {
  return futrts_isnan32(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);
}

#ifdef FUTHARK_F64_ENABLED

#if ISPC
static inline bool futrts_isinf64(float x) {
  return !isnan(x) && isnan(x - x);
}

static inline bool futrts_isfinite64(float x) {
  return !isnan(x) && !futrts_isinf64(x);
}

static inline double fdiv64(double x, double y) {
  return x / y;
}

static inline double fadd64(double x, double y) {
  return x + y;
}

static inline double fsub64(double x, double y) {
  return x - y;
}

static inline double fmul64(double x, double y) {
  return x * y;
}

static inline bool cmplt64(double x, double y) {
  return x < y;
}

static inline bool cmple64(double x, double y) {
  return x <= y;
}

static inline double sitofp_i8_f64(int8_t x) {
  return (double) x;
}

static inline double sitofp_i16_f64(int16_t x) {
  return (double) x;
}

static inline double sitofp_i32_f64(int32_t x) {
  return (double) x;
}

static inline double sitofp_i64_f64(int64_t x) {
  return (double) x;
}

static inline double uitofp_i8_f64(uint8_t x) {
  return (double) x;
}

static inline double uitofp_i16_f64(uint16_t x) {
  return (double) x;
}

static inline double uitofp_i32_f64(uint32_t x) {
  return (double) x;
}

static inline double uitofp_i64_f64(uint64_t x) {
  return (double) x;
}

static inline double fabs64(double x) {
  return abs(x);
}

static inline double fmax64(double x, double y) {
  return isnan(x) ? y : isnan(y) ? x : max(x, y);
}

static inline double fmin64(double x, double y) {
  return isnan(x) ? y : isnan(y) ? x : min(x, y);
}

static inline double fpow64(double a, double b) {
  float ret;
  foreach_active (i) {
      uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
      ret = insert(ret, i, r);
  }
  return ret;
}

static inline double futrts_log64(double x) {
  return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? log(x) : x;
}

static inline double futrts_log2_64(double x) {
  return futrts_log64(x)/log(2.0d);
}

static inline double futrts_log10_64(double x) {
  return futrts_log64(x)/log(10.0d);
}

static inline double futrts_log1p_64(double x) {
  if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;
  double y = 1.0d + x;
  double z = y - 1.0d;
  return log(y) - (z-x)/y;
}

static inline double futrts_sqrt64(double x) {
  return sqrt(x);
}

extern "C" unmasked uniform double cbrt(uniform double);
static inline double futrts_cbrt64(double x) {
  double res;
  foreach_active (i) {
    uniform double r = cbrtf(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline double futrts_exp64(double x) {
  return exp(x);
}

static inline double futrts_cos64(double x) {
  return cos(x);
}

static inline double futrts_sin64(double x) {
  return sin(x);
}

static inline double futrts_tan64(double x) {
  return tan(x);
}

static inline double futrts_acos64(double x) {
  return acos(x);
}

static inline double futrts_asin64(double x) {
  return asin(x);
}

static inline double futrts_atan64(double x) {
  return atan(x);
}

static inline double futrts_cosh64(double x) {
  return (exp(x)+exp(-x)) / 2.0d;
}

static inline double futrts_sinh64(double x) {
  return (exp(x)-exp(-x)) / 2.0d;
}

static inline double futrts_tanh64(double x) {
  return futrts_sinh64(x)/futrts_cosh64(x);
}

static inline double futrts_acosh64(double x) {
  double f = x+sqrt(x*x-1.0d);
  if(futrts_isfinite64(f)) return log(f);
  return f;
}

static inline double futrts_asinh64(double x) {
  double f = x+sqrt(x*x+1.0d);
  if(futrts_isfinite64(f)) return log(f);
  return f;
}

static inline double futrts_atanh64(double x) {
  double f = (1.0d+x)/(1.0d-x);
  if(futrts_isfinite64(f)) return log(f)/2.0d;
  return f;

}

static inline double futrts_atan2_64(double x, double y) {
  return atan2(x, y);
}

extern "C" unmasked uniform double hypot(uniform double x, uniform double y);
static inline double futrts_hypot64(double x, double y) {
  double res;
  foreach_active (i) {
    uniform double r = hypot(extract(x, i), extract(y, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform double tgamma(uniform double x);
static inline double futrts_gamma64(double x) {
  double res;
  foreach_active (i) {
    uniform double r = tgamma(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform double lgamma(uniform double x);
static inline double futrts_lgamma64(double x) {
  double res;
  foreach_active (i) {
    uniform double r = lgamma(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform double erf(uniform double x);
static inline double futrts_erf64(double x) {
  double res;
  foreach_active (i) {
    uniform double r = erf(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform double erfc(uniform double x);
static inline double futrts_erfc64(double x) {
  double res;
  foreach_active (i) {
    uniform double r = erfc(extract(x, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline double futrts_fma64(double a, double b, double c) {
  return a * b + c;
}

static inline double futrts_round64(double x) {
  return round(x);
}

static inline double futrts_ceil64(double x) {
  return ceil(x);
}

extern "C" unmasked uniform double nextafter(uniform float x, uniform double y);
static inline float futrts_nextafter64(double x, double y) {
  double res;
  foreach_active (i) {
    uniform double r = nextafter(extract(x, i), extract(y, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline double futrts_floor64(double x) {
  return floor(x);
}

static inline bool futrts_isnan64(double x) {
  return isnan(x);
}

static inline int8_t fptosi_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int8_t) x;
  }
}

static inline int16_t fptosi_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int16_t) x;
  }
}

static inline int32_t fptosi_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int32_t) x;
  }
}

static inline int64_t fptosi_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int64_t) x;
  }
}

static inline uint8_t fptoui_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint8_t) (int8_t) x;
  }
}

static inline uint16_t fptoui_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint16_t) (int16_t) x;
  }
}

static inline uint32_t fptoui_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint32_t) (int32_t) x;
  }
}

static inline uint64_t fptoui_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint64_t) (int64_t) x;
  }
}

static inline bool ftob_f64_bool(double x) {
  return x != 0.0;
}

static inline double btof_bool_f64(bool x) {
  return x ? 1.0 : 0.0;
}

static inline int64_t futrts_to_bits64(double x) {
  int64_t res;
  foreach_active (i) {
    uniform double tmp = extract(x, i);
    uniform int64_t r = *((uniform int64_t* uniform)&tmp);
    res = insert(res, i, r);
  }
  return res;
}

static inline double futrts_from_bits64(int64_t x) {
  double res;
  foreach_active (i) {
    uniform int64_t tmp = extract(x, i);
    uniform double r = *((uniform double* uniform)&tmp);
    res = insert(res, i, r);
  }
  return res;
}

static inline double fmod64(double x, double y) {
  return x - y * trunc(x/y);
}

static inline double fsignum64(double x) {
  return futrts_isnan64(x) ? x : (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 1.0d : 0.0d);
}

static inline double futrts_lerp64(double v0, double v1, double t) {
  return v0 + (v1 - v0) * t;
}

static inline double futrts_mad64(double a, double b, double c) {
  return a * b + c;
}

static inline float fpconv_f32_f32(float x) {
  return (float) x;
}

static inline double fpconv_f32_f64(float x) {
  return (double) x;
}

static inline float fpconv_f64_f32(double x) {
  return (float) x;
}

static inline double fpconv_f64_f64(double x) {
  return (double) x;
}

#else

static inline double fdiv64(double x, double y) {
  return x / y;
}

static inline double fadd64(double x, double y) {
  return x + y;
}

static inline double fsub64(double x, double y) {
  return x - y;
}

static inline double fmul64(double x, double y) {
  return x * y;
}

static inline bool cmplt64(double x, double y) {
  return x < y;
}

static inline bool cmple64(double x, double y) {
  return x <= y;
}

static inline double sitofp_i8_f64(int8_t x) {
  return (double) x;
}

static inline double sitofp_i16_f64(int16_t x) {
  return (double) x;
}

static inline double sitofp_i32_f64(int32_t x) {
  return (double) x;
}

static inline double sitofp_i64_f64(int64_t x) {
  return (double) x;
}

static inline double uitofp_i8_f64(uint8_t x) {
  return (double) x;
}

static inline double uitofp_i16_f64(uint16_t x) {
  return (double) x;
}

static inline double uitofp_i32_f64(uint32_t x) {
  return (double) x;
}

static inline double uitofp_i64_f64(uint64_t x) {
  return (double) x;
}

static inline double fabs64(double x) {
  return fabs(x);
}

static inline double fmax64(double x, double y) {
  return fmax(x, y);
}

static inline double fmin64(double x, double y) {
  return fmin(x, y);
}

static inline double fpow64(double x, double y) {
  return pow(x, y);
}

static inline double futrts_log64(double x) {
  return log(x);
}

static inline double futrts_log2_64(double x) {
  return log2(x);
}

static inline double futrts_log10_64(double x) {
  return log10(x);
}

static inline double futrts_log1p_64(double x) {
  return log1p(x);
}

static inline double futrts_sqrt64(double x) {
  return sqrt(x);
}

static inline double futrts_cbrt64(double x) {
  return cbrt(x);
}

static inline double futrts_exp64(double x) {
  return exp(x);
}

static inline double futrts_cos64(double x) {
  return cos(x);
}

static inline double futrts_sin64(double x) {
  return sin(x);
}

static inline double futrts_tan64(double x) {
  return tan(x);
}

static inline double futrts_acos64(double x) {
  return acos(x);
}

static inline double futrts_asin64(double x) {
  return asin(x);
}

static inline double futrts_atan64(double x) {
  return atan(x);
}

static inline double futrts_cosh64(double x) {
  return cosh(x);
}

static inline double futrts_sinh64(double x) {
  return sinh(x);
}

static inline double futrts_tanh64(double x) {
  return tanh(x);
}

static inline double futrts_acosh64(double x) {
  return acosh(x);
}

static inline double futrts_asinh64(double x) {
  return asinh(x);
}

static inline double futrts_atanh64(double x) {
  return atanh(x);
}

static inline double futrts_atan2_64(double x, double y) {
  return atan2(x, y);
}

static inline double futrts_hypot64(double x, double y) {
  return hypot(x, y);
}

static inline double futrts_gamma64(double x) {
  return tgamma(x);
}

static inline double futrts_lgamma64(double x) {
  return lgamma(x);
}

static inline double futrts_erf64(double x) {
  return erf(x);
}

static inline double futrts_erfc64(double x) {
  return erfc(x);
}

static inline double futrts_fma64(double a, double b, double c) {
  return fma(a, b, c);
}

static inline double futrts_round64(double x) {
  return rint(x);
}

static inline double futrts_ceil64(double x) {
  return ceil(x);
}

static inline float futrts_nextafter64(float x, float y) {
  return nextafter(x, y);
}

static inline double futrts_floor64(double x) {
  return floor(x);
}

static inline bool futrts_isnan64(double x) {
  return isnan(x);
}

static inline bool futrts_isinf64(double x) {
  return isinf(x);
}

static inline int8_t fptosi_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int8_t) x;
  }
}

static inline int16_t fptosi_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int16_t) x;
  }
}

static inline int32_t fptosi_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int32_t) x;
  }
}

static inline int64_t fptosi_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (int64_t) x;
  }
}

static inline uint8_t fptoui_f64_i8(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint8_t) (int8_t) x;
  }
}

static inline uint16_t fptoui_f64_i16(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint16_t) (int16_t) x;
  }
}

static inline uint32_t fptoui_f64_i32(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint32_t) (int32_t) x;
  }
}

static inline uint64_t fptoui_f64_i64(double x) {
  if (futrts_isnan64(x) || futrts_isinf64(x)) {
    return 0;
  } else {
    return (uint64_t) (int64_t) x;
  }
}

static inline bool ftob_f64_bool(double x) {
  return x != 0;
}

static inline double btof_bool_f64(bool x) {
  return x ? 1 : 0;
}

static inline int64_t futrts_to_bits64(double x) {
  union {
    double f;
    int64_t t;
  } p;

  p.f = x;
  return p.t;
}

static inline double futrts_from_bits64(int64_t x) {
  union {
    int64_t f;
    double t;
  } p;

  p.f = x;
  return p.t;
}

static inline double fmod64(double x, double y) {
  return fmod(x, y);
}

static inline double fsignum64(double x) {
  return futrts_isnan64(x) ? x : (x > 0) - (x < 0);
}

static inline double futrts_lerp64(double v0, double v1, double t) {
#ifdef __OPENCL_VERSION__
  return mix(v0, v1, t);
#else
  return v0 + (v1 - v0) * t;
#endif
}

static inline double futrts_mad64(double a, double b, double c) {
#ifdef __OPENCL_VERSION__
  return mad(a, b, c);
#else
  return a * b + c;
#endif
}

static inline float fpconv_f32_f32(float x) {
  return (float) x;
}

static inline double fpconv_f32_f64(float x) {
  return (double) x;
}

static inline float fpconv_f64_f32(double x) {
  return (float) x;
}

static inline double fpconv_f64_f64(double x) {
  return (double) x;
}

#endif

#endif

// End of scalar.h.
// Start of scalar_f16.h.

// Half-precision is emulated if needed (e.g. in straight C) with the
// native type used if possible.  The emulation works by typedef'ing
// 'float' to 'f16', and then implementing all operations on single
// precision.  To cut down on duplication, we use the same code for
// those Futhark functions that require just operators or casts.  The
// in-memory representation for arrays will still be 16 bits even
// under emulation, so the compiler will have to be careful when
// generating reads or writes.

#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC))
#define EMULATE_F16
#endif

#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#ifdef EMULATE_F16

// Note that the half-precision storage format is still 16 bits - the
// compiler will have to be real careful!
typedef float f16;

#elif ISPC
typedef float16 f16;

#else

#ifdef __CUDA_ARCH__
#include <cuda_fp16.h>
#endif

typedef half f16;

#endif

// Some of these functions convert to single precision because half
// precision versions are not available.

static inline f16 fadd16(f16 x, f16 y) {
  return x + y;
}

static inline f16 fsub16(f16 x, f16 y) {
  return x - y;
}

static inline f16 fmul16(f16 x, f16 y) {
  return x * y;
}

static inline bool cmplt16(f16 x, f16 y) {
  return x < y;
}

static inline bool cmple16(f16 x, f16 y) {
  return x <= y;
}

static inline f16 sitofp_i8_f16(int8_t x) {
  return (f16) x;
}

static inline f16 sitofp_i16_f16(int16_t x) {
  return (f16) x;
}

static inline f16 sitofp_i32_f16(int32_t x) {
  return (f16) x;
}

static inline f16 sitofp_i64_f16(int64_t x) {
  return (f16) x;
}

static inline f16 uitofp_i8_f16(uint8_t x) {
  return (f16) x;
}

static inline f16 uitofp_i16_f16(uint16_t x) {
  return (f16) x;
}

static inline f16 uitofp_i32_f16(uint32_t x) {
  return (f16) x;
}

static inline f16 uitofp_i64_f16(uint64_t x) {
  return (f16) x;
}

static inline int8_t fptosi_f16_i8(f16 x) {
  return (int8_t) (float) x;
}

static inline int16_t fptosi_f16_i16(f16 x) {
  return (int16_t) x;
}

static inline int32_t fptosi_f16_i32(f16 x) {
  return (int32_t) x;
}

static inline int64_t fptosi_f16_i64(f16 x) {
  return (int64_t) x;
}

static inline uint8_t fptoui_f16_i8(f16 x) {
  return (uint8_t) (float) x;
}

static inline uint16_t fptoui_f16_i16(f16 x) {
  return (uint16_t) x;
}

static inline uint32_t fptoui_f16_i32(f16 x) {
  return (uint32_t) x;
}

static inline uint64_t fptoui_f16_i64(f16 x) {
  return (uint64_t) x;
}

static inline bool ftob_f16_bool(f16 x) {
  return x != (f16)0;
}

static inline f16 btof_bool_f16(bool x) {
  return x ? 1 : 0;
}

#ifndef EMULATE_F16
static inline bool futrts_isnan16(f16 x) {
  return isnan((float)x);
}

#ifdef __OPENCL_VERSION__

static inline f16 fabs16(f16 x) {
  return fabs(x);
}

static inline f16 fmax16(f16 x, f16 y) {
  return fmax(x, y);
}

static inline f16 fmin16(f16 x, f16 y) {
  return fmin(x, y);
}

static inline f16 fpow16(f16 x, f16 y) {
  return pow(x, y);
}

#elif ISPC
static inline f16 fabs16(f16 x) {
  return abs(x);
}

static inline f16 fmax16(f16 x, f16 y) {
  return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y);
}

static inline f16 fmin16(f16 x, f16 y) {
  return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y);
}

static inline f16 fpow16(f16 x, f16 y) {
  return pow(x, y);
}
#else // Assuming CUDA.

static inline f16 fabs16(f16 x) {
  return fabsf(x);
}

static inline f16 fmax16(f16 x, f16 y) {
  return fmaxf(x, y);
}

static inline f16 fmin16(f16 x, f16 y) {
  return fminf(x, y);
}

static inline f16 fpow16(f16 x, f16 y) {
  return powf(x, y);
}
#endif

#if ISPC
static inline bool futrts_isinf16(float x) {
  return !futrts_isnan16(x) && futrts_isnan16(x - x);
}
static inline bool futrts_isfinite16(float x) {
  return !futrts_isnan16(x) && !futrts_isinf16(x);
}

#else

static inline bool futrts_isinf16(f16 x) {
  return isinf((float)x);
}
#endif

#ifdef __OPENCL_VERSION__
static inline f16 futrts_log16(f16 x) {
  return log(x);
}

static inline f16 futrts_log2_16(f16 x) {
  return log2(x);
}

static inline f16 futrts_log10_16(f16 x) {
  return log10(x);
}

static inline f16 futrts_log1p_16(f16 x) {
  return log1p(x);
}

static inline f16 futrts_sqrt16(f16 x) {
  return sqrt(x);
}

static inline f16 futrts_cbrt16(f16 x) {
  return cbrt(x);
}

static inline f16 futrts_exp16(f16 x) {
  return exp(x);
}

static inline f16 futrts_cos16(f16 x) {
  return cos(x);
}

static inline f16 futrts_sin16(f16 x) {
  return sin(x);
}

static inline f16 futrts_tan16(f16 x) {
  return tan(x);
}

static inline f16 futrts_acos16(f16 x) {
  return acos(x);
}

static inline f16 futrts_asin16(f16 x) {
  return asin(x);
}

static inline f16 futrts_atan16(f16 x) {
  return atan(x);
}

static inline f16 futrts_cosh16(f16 x) {
  return cosh(x);
}

static inline f16 futrts_sinh16(f16 x) {
  return sinh(x);
}

static inline f16 futrts_tanh16(f16 x) {
  return tanh(x);
}

static inline f16 futrts_acosh16(f16 x) {
  return acosh(x);
}

static inline f16 futrts_asinh16(f16 x) {
  return asinh(x);
}

static inline f16 futrts_atanh16(f16 x) {
  return atanh(x);
}

static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return atan2(x, y);
}

static inline f16 futrts_hypot16(f16 x, f16 y) {
  return hypot(x, y);
}

static inline f16 futrts_gamma16(f16 x) {
  return tgamma(x);
}

static inline f16 futrts_lgamma16(f16 x) {
  return lgamma(x);
}

static inline f16 futrts_erf16(f16 x) {
  return erf(x);
}

static inline f16 futrts_erfc16(f16 x) {
  return erfc(x);
}

static inline f16 fmod16(f16 x, f16 y) {
  return fmod(x, y);
}

static inline f16 futrts_round16(f16 x) {
  return rint(x);
}

static inline f16 futrts_floor16(f16 x) {
  return floor(x);
}

static inline f16 futrts_ceil16(f16 x) {
  return ceil(x);
}

static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return nextafter(x, y);
}

static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return mix(v0, v1, t);
}

static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return mad(a, b, c);
}

static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return fma(a, b, c);
}
#elif ISPC

static inline f16 futrts_log16(f16 x) {
  return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x;
}

static inline f16 futrts_log2_16(f16 x) {
  return futrts_log16(x) / log(2.0f16);
}

static inline f16 futrts_log10_16(f16 x) {
  return futrts_log16(x) / log(10.0f16);
}

static inline f16 futrts_log1p_16(f16 x) {
  if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16;
  f16 y = 1.0f16 + x;
  f16 z = y - 1.0f16;
  return log(y) - (z-x)/y;
}

static inline f16 futrts_sqrt16(f16 x) {
  return (float16)sqrt((float)x);
}

static inline f16 futrts_exp16(f16 x) {
  return exp(x);
}

static inline f16 futrts_cos16(f16 x) {
  return (float16)cos((float)x);
}

static inline f16 futrts_sin16(f16 x) {
  return (float16)sin((float)x);
}

static inline f16 futrts_tan16(f16 x) {
  return (float16)tan((float)x);
}

static inline f16 futrts_acos16(f16 x) {
  return (float16)acos((float)x);
}

static inline f16 futrts_asin16(f16 x) {
  return (float16)asin((float)x);
}

static inline f16 futrts_atan16(f16 x) {
  return (float16)atan((float)x);
}

static inline f16 futrts_cosh16(f16 x) {
  return (exp(x)+exp(-x)) / 2.0f16;
}

static inline f16 futrts_sinh16(f16 x) {
  return (exp(x)-exp(-x)) / 2.0f16;
}

static inline f16 futrts_tanh16(f16 x) {
  return futrts_sinh16(x)/futrts_cosh16(x);
}

static inline f16 futrts_acosh16(f16 x) {
  float16 f = x+(float16)sqrt((float)(x*x-1));
  if(futrts_isfinite16(f)) return log(f);
  return f;
}

static inline f16 futrts_asinh16(f16 x) {
  float16 f = x+(float16)sqrt((float)(x*x+1));
  if(futrts_isfinite16(f)) return log(f);
  return f;
}

static inline f16 futrts_atanh16(f16 x) {
  float16 f = (1+x)/(1-x);
  if(futrts_isfinite16(f)) return log(f)/2.0f16;
  return f;
}

static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return (float16)atan2((float)x, (float)y);
}

static inline f16 futrts_hypot16(f16 x, f16 y) {
  return (float16)futrts_hypot32((float)x, (float)y);
}

extern "C" unmasked uniform float tgammaf(uniform float x);
static inline f16 futrts_gamma16(f16 x) {
  f16 res;
  foreach_active (i) {
    uniform f16 r = (f16)tgammaf(extract((float)x, i));
    res = insert(res, i, r);
  }
  return res;
}

extern "C" unmasked uniform float lgammaf(uniform float x);
static inline f16 futrts_lgamma16(f16 x) {
  f16 res;
  foreach_active (i) {
    uniform f16 r = (f16)lgammaf(extract((float)x, i));
    res = insert(res, i, r);
  }
  return res;
}

static inline f16 futrts_cbrt16(f16 x) {
  f16 res = (f16)futrts_cbrt32((float)x);
  return res;
}

static inline f16 futrts_erf16(f16 x) {
  f16 res = (f16)futrts_erf32((float)x);
  return res;
}

static inline f16 futrts_erfc16(f16 x) {
  f16 res = (f16)futrts_erfc32((float)x);
  return res;
}

static inline f16 fmod16(f16 x, f16 y) {
  return x - y * (float16)trunc((float) (x/y));
}

static inline f16 futrts_round16(f16 x) {
  return (float16)round((float)x);
}

static inline f16 futrts_floor16(f16 x) {
  return (float16)floor((float)x);
}

static inline f16 futrts_ceil16(f16 x) {
  return (float16)ceil((float)x);
}

static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return (float16)futrts_nextafter32((float)x, (float) y);
}

static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return v0 + (v1 - v0) * t;
}

static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return a * b + c;
}

static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return a * b + c;
}

#else // Assume CUDA.

static inline f16 futrts_log16(f16 x) {
  return hlog(x);
}

static inline f16 futrts_log2_16(f16 x) {
  return hlog2(x);
}

static inline f16 futrts_log10_16(f16 x) {
  return hlog10(x);
}

static inline f16 futrts_log1p_16(f16 x) {
  return (f16)log1pf((float)x);
}

static inline f16 futrts_sqrt16(f16 x) {
  return hsqrt(x);
}

static inline f16 futrts_cbrt16(f16 x) {
  return cbrtf(x);
}

static inline f16 futrts_exp16(f16 x) {
  return hexp(x);
}

static inline f16 futrts_cos16(f16 x) {
  return hcos(x);
}

static inline f16 futrts_sin16(f16 x) {
  return hsin(x);
}

static inline f16 futrts_tan16(f16 x) {
  return tanf(x);
}

static inline f16 futrts_acos16(f16 x) {
  return acosf(x);
}

static inline f16 futrts_asin16(f16 x) {
  return asinf(x);
}

static inline f16 futrts_atan16(f16 x) {
  return atanf(x);
}

static inline f16 futrts_cosh16(f16 x) {
  return coshf(x);
}

static inline f16 futrts_sinh16(f16 x) {
  return sinhf(x);
}

static inline f16 futrts_tanh16(f16 x) {
  return tanhf(x);
}

static inline f16 futrts_acosh16(f16 x) {
  return acoshf(x);
}

static inline f16 futrts_asinh16(f16 x) {
  return asinhf(x);
}

static inline f16 futrts_atanh16(f16 x) {
  return atanhf(x);
}

static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return atan2f(x, y);
}

static inline f16 futrts_hypot16(f16 x, f16 y) {
  return hypotf(x, y);
}

static inline f16 futrts_gamma16(f16 x) {
  return tgammaf(x);
}

static inline f16 futrts_lgamma16(f16 x) {
  return lgammaf(x);
}

static inline f16 futrts_erf16(f16 x) {
  return erff(x);
}

static inline f16 futrts_erfc16(f16 x) {
  return erfcf(x);
}

static inline f16 fmod16(f16 x, f16 y) {
  return fmodf(x, y);
}

static inline f16 futrts_round16(f16 x) {
  return rintf(x);
}

static inline f16 futrts_floor16(f16 x) {
  return hfloor(x);
}

static inline f16 futrts_ceil16(f16 x) {
  return hceil(x);
}

static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y)));
}

static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return v0 + (v1 - v0) * t;
}

static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return a * b + c;
}

static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return fmaf(a, b, c);
}

#endif

// The CUDA __half type cannot be put in unions for some reason, so we
// use bespoke conversion functions instead.
#ifdef __CUDA_ARCH__
static inline int16_t futrts_to_bits16(f16 x) {
  return __half_as_ushort(x);
}
static inline f16 futrts_from_bits16(int16_t x) {
  return __ushort_as_half(x);
}
#elif ISPC

static inline int16_t futrts_to_bits16(f16 x) {
  varying int16_t y = *((varying int16_t * uniform)&x);
  return y;
}

static inline f16 futrts_from_bits16(int16_t x) {
  varying f16 y = *((varying f16 * uniform)&x);
  return y;
}
#else
static inline int16_t futrts_to_bits16(f16 x) {
  union {
    f16 f;
    int16_t t;
  } p;

  p.f = x;
  return p.t;
}

static inline f16 futrts_from_bits16(int16_t x) {
  union {
    int16_t f;
    f16 t;
  } p;

  p.f = x;
  return p.t;
}
#endif

#else // No native f16 - emulate.

static inline f16 fabs16(f16 x) {
  return fabs32(x);
}

static inline f16 fmax16(f16 x, f16 y) {
  return fmax32(x, y);
}

static inline f16 fmin16(f16 x, f16 y) {
  return fmin32(x, y);
}

static inline f16 fpow16(f16 x, f16 y) {
  return fpow32(x, y);
}

static inline bool futrts_isnan16(f16 x) {
  return futrts_isnan32(x);
}

static inline bool futrts_isinf16(f16 x) {
  return futrts_isinf32(x);
}

static inline f16 futrts_log16(f16 x) {
  return futrts_log32(x);
}

static inline f16 futrts_log2_16(f16 x) {
  return futrts_log2_32(x);
}

static inline f16 futrts_log10_16(f16 x) {
  return futrts_log10_32(x);
}

static inline f16 futrts_log1p_16(f16 x) {
  return futrts_log1p_32(x);
}

static inline f16 futrts_sqrt16(f16 x) {
  return futrts_sqrt32(x);
}

static inline f16 futrts_cbrt16(f16 x) {
  return futrts_cbrt32(x);
}

static inline f16 futrts_exp16(f16 x) {
  return futrts_exp32(x);
}

static inline f16 futrts_cos16(f16 x) {
  return futrts_cos32(x);
}

static inline f16 futrts_sin16(f16 x) {
  return futrts_sin32(x);
}

static inline f16 futrts_tan16(f16 x) {
  return futrts_tan32(x);
}

static inline f16 futrts_acos16(f16 x) {
  return futrts_acos32(x);
}

static inline f16 futrts_asin16(f16 x) {
  return futrts_asin32(x);
}

static inline f16 futrts_atan16(f16 x) {
  return futrts_atan32(x);
}

static inline f16 futrts_cosh16(f16 x) {
  return futrts_cosh32(x);
}

static inline f16 futrts_sinh16(f16 x) {
  return futrts_sinh32(x);
}

static inline f16 futrts_tanh16(f16 x) {
  return futrts_tanh32(x);
}

static inline f16 futrts_acosh16(f16 x) {
  return futrts_acosh32(x);
}

static inline f16 futrts_asinh16(f16 x) {
  return futrts_asinh32(x);
}

static inline f16 futrts_atanh16(f16 x) {
  return futrts_atanh32(x);
}

static inline f16 futrts_atan2_16(f16 x, f16 y) {
  return futrts_atan2_32(x, y);
}

static inline f16 futrts_hypot16(f16 x, f16 y) {
  return futrts_hypot32(x, y);
}

static inline f16 futrts_gamma16(f16 x) {
  return futrts_gamma32(x);
}

static inline f16 futrts_lgamma16(f16 x) {
  return futrts_lgamma32(x);
}

static inline f16 futrts_erf16(f16 x) {
  return futrts_erf32(x);
}

static inline f16 futrts_erfc16(f16 x) {
  return futrts_erfc32(x);
}

static inline f16 fmod16(f16 x, f16 y) {
  return fmod32(x, y);
}

static inline f16 futrts_round16(f16 x) {
  return futrts_round32(x);
}

static inline f16 futrts_floor16(f16 x) {
  return futrts_floor32(x);
}

static inline f16 futrts_ceil16(f16 x) {
  return futrts_ceil32(x);
}

static inline f16 futrts_nextafter16(f16 x, f16 y) {
  return halfbits2float(halfbitsnextafter(float2halfbits(x), float2halfbits(y)));
}

static inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {
  return futrts_lerp32(v0, v1, t);
}

static inline f16 futrts_mad16(f16 a, f16 b, f16 c) {
  return futrts_mad32(a, b, c);
}

static inline f16 futrts_fma16(f16 a, f16 b, f16 c) {
  return futrts_fma32(a, b, c);
}

// Even when we are using an OpenCL that does not support cl_khr_fp16,
// it must still support vload_half for actually creating a
// half-precision number, which can then be efficiently converted to a
// float.  Similarly for vstore_half.
#ifdef __OPENCL_VERSION__

static inline int16_t futrts_to_bits16(f16 x) {
  int16_t y;
  // Violating strict aliasing here.
  vstore_half((float)x, 0, (half*)&y);
  return y;
}

static inline f16 futrts_from_bits16(int16_t x) {
  return (f16)vload_half(0, (half*)&x);
}

#else

static inline int16_t futrts_to_bits16(f16 x) {
  return (int16_t)float2halfbits(x);
}

static inline f16 futrts_from_bits16(int16_t x) {
  return halfbits2float((uint16_t)x);
}

static inline f16 fsignum16(f16 x) {
  return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);
}

#endif

#endif

static inline float fpconv_f16_f16(f16 x) {
  return x;
}

static inline float fpconv_f16_f32(f16 x) {
  return x;
}

static inline f16 fpconv_f32_f16(float x) {
  return (f16) x;
}

#ifdef FUTHARK_F64_ENABLED

static inline double fpconv_f16_f64(f16 x) {
  return (double) x;
}

#if ISPC
static inline f16 fpconv_f64_f16(double x) {
  return (f16) ((float)x);
}
#else
static inline f16 fpconv_f64_f16(double x) {
  return (f16) x;
}
#endif
#endif


// End of scalar_f16.h.

// Start of context_prototypes.h
//
// Prototypes for the functions in context.h, or that will be called
// from those functions, that need to be available very early.

struct futhark_context_config;
struct futhark_context;

static void set_error(struct futhark_context* ctx, char *error);

// These are called in context/config new/free functions and contain
// shared setup.  They are generated by the compiler itself.
static int init_constants(struct futhark_context*);
static int free_constants(struct futhark_context*);
static void setup_program(struct futhark_context* ctx);
static void teardown_program(struct futhark_context *ctx);

// Allocate host memory.  Must be freed with host_free().
static void host_alloc(struct futhark_context* ctx, size_t size, const char* tag, size_t* size_out, void** mem_out);
// Allocate memory allocated with host_alloc().
static void host_free(struct futhark_context* ctx, size_t size, const char* tag, void* mem);

// Functions that must be defined by the backend.
static void backend_context_config_setup(struct futhark_context_config* cfg);
static void backend_context_config_teardown(struct futhark_context_config* cfg);
static int backend_context_setup(struct futhark_context *ctx);
static void backend_context_teardown(struct futhark_context *ctx);

// End of of context_prototypes.h

struct memblock_device {
    int *references;
    cl_mem mem;
    int64_t size;
    const char *desc;
};
struct memblock {
    int *references;
    unsigned char *mem;
    int64_t size;
    const char *desc;
};
struct constants {
    int dummy;
};
struct tuning_params {
    int64_t *addzisegmap_group_sizze_6879;
    int64_t *add_i64zisegmap_group_sizze_6899;
};
static const int num_tuning_params = 2;
static const char *tuning_param_names[] = {"add.segmap_group_size_6879", "add_i64.segmap_group_size_6899", NULL};
static const char *tuning_param_vars[] = {"addzisegmap_group_sizze_6879", "add_i64zisegmap_group_sizze_6899", NULL};
static const char *tuning_param_classes[] = {"group_size", "group_size", NULL};
static int64_t tuning_param_defaults[] = {0, 0, 0};
static const int max_failure_args = 0;
static const int f64_required = 0;
static const char *opencl_program[] = {"\n// Clang-based OpenCL implementations need this for 'static' to work.\n#ifdef cl_clang_storage_class_specifiers\n#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable\n#endif\n#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n\n// Some OpenCL programs dislike empty progams, or programs with no kernels.\n// Declare a dummy kernel to ensure they remain our friends.\n__kernel void dummy_kernel(__global unsigned char *dummy, int n)\n{\n    const int thread_gid = get_global_id(0);\n    if (thread_gid >= n) return;\n}\n\n#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n\ntypedef char int8_t;\ntypedef short int16_t;\ntypedef int int32_t;\ntypedef long int64_t;\n\ntypedef uchar uint8_t;\ntypedef ushort uint16_t;\ntypedef uint uint32_t;\ntypedef ulong uint64_t;\n\n// NVIDIAs OpenCL does not create device-wide memory fences (see #734), so we\n// use inline assembly if we detect we are on an NVIDIA GPU.\n#ifdef cl_nv_pragma_unroll\nstatic inline void mem_fence_global() {\n  asm(\"membar.gl;\");\n}\n#else\nstatic inline void mem_fence_global() {\n  mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n}\n#endif\nstatic inline void mem_fence_local() {\n  mem_fence(CLK_LOCAL_MEM_FENCE);\n}\n// Start of half.h.\n\n// Conversion functions are from http://half.sourceforge.net/, but\n// translated to C.\n//\n// Copyright (c) 2012-2021 Christian Rau\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to deal\n// in the Software without restriction, including without limitation the rights\n// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n// copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and this permission notice shall be included in\n// all copies or substantial portions of the Softwa", "re.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n// THE SOFTWARE.\n\n#ifndef __OPENCL_VERSION__\n#define __constant\n#endif\n\n__constant static const uint16_t base_table[512] = {\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,\n  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,\n  0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,\n  0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C0", "0, 0x7C00, 0x7C00, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n  0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,\n  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,\n  0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,\n  0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,\n  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n  0xFC00, 0xFC00, 0xFC0",
                                       "0, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,\n  0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 };\n\n__constant static const unsigned char shift_table[512] = {\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,\n  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 2", "4, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,\n  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,\n  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };\n\n__constant static const uint32_t mantissa_table[2048] = {\n  0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,\n  0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,\n  0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,\n  0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,\n  0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,\n  0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA00", "00, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,\n  0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,\n  0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,\n  0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,\n  0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,\n  0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,\n  0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,\n  0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,\n  0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,\n  0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,\n  0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A00",
                                       "00, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,\n  0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,\n  0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,\n  0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,\n  0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,\n  0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,\n  0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,\n  0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,\n  0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,\n  0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,\n  0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF80", "00,\n  0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,\n  0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,\n  0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,\n  0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,\n  0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,\n  0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,\n  0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,\n  0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,\n  0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,\n  0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,\n  0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x3811", "0000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,\n  0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,\n  0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,\n  0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,\n  0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,\n  0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,\n  0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,\n  0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,\n  0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,\n  0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,\n  0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A",
                                       "4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,\n  0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,\n  0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,\n  0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,\n  0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,\n  0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,\n  0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,\n  0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,\n  0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,\n  0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,\n  0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x3863", "8000, 0x3863C000,\n  0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,\n  0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,\n  0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,\n  0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,\n  0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,\n  0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,\n  0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,\n  0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,\n  0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,\n  0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,\n  0x38060000, 0x38062000, 0x38064000, 0x38", "066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,\n  0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,\n  0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,\n  0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,\n  0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,\n  0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,\n  0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,\n  0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,\n  0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,\n  0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,\n  0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x38",
                                       "1B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,\n  0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,\n  0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,\n  0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,\n  0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,\n  0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,\n  0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,\n  0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,\n  0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,\n  0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,\n  0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x38", "2FA000, 0x382FC000, 0x382FE000,\n  0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,\n  0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,\n  0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,\n  0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,\n  0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,\n  0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,\n  0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,\n  0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,\n  0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,\n  0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,\n  0x38440000, 0x38442000, 0x", "38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,\n  0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,\n  0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,\n  0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,\n  0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,\n  0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,\n  0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,\n  0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,\n  0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,\n  0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,\n  0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x",
                                       "3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,\n  0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,\n  0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,\n  0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,\n  0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,\n  0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,\n  0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,\n  0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,\n  0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,\n  0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,\n  0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x", "386D8000, 0x386DA000, 0x386DC000, 0x386DE000,\n  0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,\n  0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,\n  0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,\n  0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,\n  0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,\n  0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,\n  0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,\n  0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,\n  0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };\n__constant static const uint32_t exponent_table[64] = {\n  0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06", "000000, 0x06800000, 0x07000000, 0x07800000,\n  0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,\n  0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,\n  0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };\n__constant static const unsigned short offset_table[64] = {\n  0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,\n  0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };\n\nstatic uint16_t float2halfbits(float value) {\n  union { float x; uint32_t y; } u;\n  u.x = value;\n  uint32_t bits = u.y;\n\n  uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);;\n\n  return hbits;\n}\n\nstatic float halfbits2float(uint16_t value) {\n  uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];\n\n  union { uint32_t x; float y; } u;\n  u.x = bits;\n  return u.y;\n}\n\nstatic uint16_t halfbitsnextafter(uint16_t from, uint16_t to) {\n  int fabs = from & 0x7FFF, tabs = to & 0x7FFF;\n  if(fabs > 0x7C00 || tabs > 0x7C00) {\n    return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200);\n  }\n  if(from == to || !(fabs|tabs)) {\n    return to;\n  }\n  if(!fabs) {\n    return (to&0x8000)+1;\n  }\n  unsigned int out =\n    from +\n    (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1)\n    - 1;\n  return out;",
                                       "\n}\n\n// End of half.h.\n// Start of scalar.h.\n\n// Implementation of the primitive scalar operations.  Very\n// repetitive.  This code is inserted directly into both CUDA and\n// OpenCL programs, as well as the CPU code, so it has some #ifdefs to\n// work everywhere.  Some operations are defined as macros because\n// this allows us to use them as constant expressions in things like\n// array sizes and static initialisers.\n\n// Some of the #ifdefs are because OpenCL uses type-generic functions\n// for some operations (e.g. sqrt), while C and CUDA sensibly use\n// distinct functions for different precisions (e.g. sqrtf() and\n// sqrt()).  This is quite annoying.  Due to C's unfortunate casting\n// rules, it is also really easy to accidentally implement\n// floating-point functions in the wrong precision, so be careful.\n\n// Double-precision definitions are only included if the preprocessor\n// macro FUTHARK_F64_ENABLED is set.\n\nstatic inline uint8_t add8(uint8_t x, uint8_t y) {\n  return x + y;\n}\n\nstatic inline uint16_t add16(uint16_t x, uint16_t y) {\n  return x + y;\n}\n\nstatic inline uint32_t add32(uint32_t x, uint32_t y) {\n  return x + y;\n}\n\nstatic inline uint64_t add64(uint64_t x, uint64_t y) {\n  return x + y;\n}\n\nstatic inline uint8_t sub8(uint8_t x, uint8_t y) {\n  return x - y;\n}\n\nstatic inline uint16_t sub16(uint16_t x, uint16_t y) {\n  return x - y;\n}\n\nstatic inline uint32_t sub32(uint32_t x, uint32_t y) {\n  return x - y;\n}\n\nstatic inline uint64_t sub64(uint64_t x, uint64_t y) {\n  return x - y;\n}\n\nstatic inline uint8_t mul8(uint8_t x, uint8_t y) {\n  return x * y;\n}\n\nstatic inline uint16_t mul16(uint16_t x, uint16_t y) {\n  return x * y;\n}\n\nstatic inline uint32_t mul32(uint32_t x, uint32_t y) {\n  return x * y;\n}\n\nstatic inline uint64_t mul64(uint64_t x, uint64_t y) {\n  return x * y;\n}\n\n#if ISPC\n\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y) {\n  // This strange pattern is used to prevent the ISPC compiler from\n  // causing SIGFPEs and bogus results on divisions where inactive lan", "es\n  // have 0-valued divisors. It ensures that any inactive lane instead\n  // has a divisor of 1. https://github.com/ispc/ispc/issues/2292\n  uint8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n\n  return x / ys;\n}\n\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y) {\n  uint16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x / ys;\n}\n\nstatic inline uint32_t udiv32(uint32_t x, uint32_t y) {\n  uint32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n\n  return x / ys;\n}\n\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y) {\n  uint64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n\n  return x / ys;\n}\n\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y) {\n  uint8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n\n  return (x + y - 1) / ys;\n}\n\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y) {\n  uint16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return (x + y - 1) / ys;\n}\n\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y) {\n  uint32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return (x + y - 1) / ys;\n}\n\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y) {\n  uint64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return (x + y - 1) / ys;\n}\n\nstatic inline uint8_t umod8(uint8_t x, uint8_t y) {\n  uint8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline uint16_t umod16(uint16_t x, uint16_t y) {\n  uint16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n\n  return x % ys;\n}\n\nstatic inline uint32_t umod32(uint32_t x, uint32_t y) {\n  uint32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline uint64_t umod64(uint64_t x, uint64_t y) {\n  uint64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline uint8_t udiv_safe8(uint8_t x, uint8_t y) {\n  uint8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {\n  uint16_t ys = 1;\n  foreach_active(i){\n    ", "ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {\n  uint32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {\n  uint64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {\n  uint8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {\n  uint16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {\n  uint32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {\n  uint64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : (x + y - 1) / ys;\n}\n\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y) {\n  uint8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y) {\n  uint16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y) {\n  uint32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y) {\n  uint64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int8_t sdiv8(int8_t x, int8_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int8_t q = x / ys;\n  int8_t r = x % ys;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int16_t sdiv16(int16_t x, int16_t y) {\n  int16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int16_t q = x / ys;\n  int16_t r = x % ys;\n\n  return q - ((r != 0",
                                       " && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int32_t sdiv32(int32_t x, int32_t y) {\n  int32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  int32_t q = x / ys;\n  int32_t r = x % ys;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int64_t sdiv64(int64_t x, int64_t y) {\n  int64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int64_t q = x / ys;\n  int64_t r = x % ys;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y) {\n  return sdiv8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y) {\n  return sdiv16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y) {\n  return sdiv32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up64(int64_t x, int64_t y) {\n  return sdiv64(x + y - 1, y);\n}\n\nstatic inline int8_t smod8(int8_t x, int8_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int8_t r = x % ys;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int16_t smod16(int16_t x, int16_t y) {\n  int16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int16_t r = x % ys;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int32_t smod32(int32_t x, int32_t y) {\n  int32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int32_t r = x % ys;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int64_t smod64(int64_t x, int64_t y) {\n  int64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  int64_t r = x % ys;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y) {\n  return y == 0 ? 0 : sdiv8(x, y);\n}\n\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y) {\n  return y == 0 ? 0 : sdiv16(x, y);\n}\n\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y) {\n  return y == 0 ? 0 : sdiv32(x, y);\n}\n\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y) {\n  return y == 0 ? 0 : sdi", "v64(x, y);\n}\n\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {\n  return sdiv_safe8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {\n  return sdiv_safe16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {\n  return sdiv_safe32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {\n  return sdiv_safe64(x + y - 1, y);\n}\n\nstatic inline int8_t smod_safe8(int8_t x, int8_t y) {\n  return y == 0 ? 0 : smod8(x, y);\n}\n\nstatic inline int16_t smod_safe16(int16_t x, int16_t y) {\n  return y == 0 ? 0 : smod16(x, y);\n}\n\nstatic inline int32_t smod_safe32(int32_t x, int32_t y) {\n  return y == 0 ? 0 : smod32(x, y);\n}\n\nstatic inline int64_t smod_safe64(int64_t x, int64_t y) {\n  return y == 0 ? 0 : smod64(x, y);\n}\n\nstatic inline int8_t squot8(int8_t x, int8_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x / ys;\n}\n\nstatic inline int16_t squot16(int16_t x, int16_t y) {\n  int16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x / ys;\n}\n\nstatic inline int32_t squot32(int32_t x, int32_t y) {\n  int32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x / ys;\n}\n\nstatic inline int64_t squot64(int64_t x, int64_t y) {\n  int64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x / ys;\n}\n\nstatic inline int8_t srem8(int8_t x, int8_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline int16_t srem16(int16_t x, int16_t y) {\n  int16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline int32_t srem32(int32_t x, int32_t y) {\n  int32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline int64_t srem64(int64_t x, int64_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return x % ys;\n}\n\nstatic inline int8_t squot_safe8(int8_t x, int8_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int16_t ", "squot_safe16(int16_t x, int16_t y) {\n  int16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int32_t squot_safe32(int32_t x, int32_t y) {\n  int32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int64_t squot_safe64(int64_t x, int64_t y) {\n  int64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x / ys;\n}\n\nstatic inline int8_t srem_safe8(int8_t x, int8_t y) {\n  int8_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int16_t srem_safe16(int16_t x, int16_t y) {\n  int16_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int32_t srem_safe32(int32_t x, int32_t y) {\n  int32_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\nstatic inline int64_t srem_safe64(int64_t x, int64_t y) {\n  int64_t ys = 1;\n  foreach_active(i){\n    ys = y;\n  }\n  \n  return y == 0 ? 0 : x % ys;\n}\n\n#else\n\nstatic inline uint8_t udiv8(uint8_t x, uint8_t y) {\n  return x / y;\n}\n\nstatic inline uint16_t udiv16(uint16_t x, uint16_t y) {\n  return x / y;\n}\n\nstatic inline uint32_t udiv32(uint32_t x, uint32_t y) {\n  return x / y;\n}\n\nstatic inline uint64_t udiv64(uint64_t x, uint64_t y) {\n  return x / y;\n}\n\nstatic inline uint8_t udiv_up8(uint8_t x, uint8_t y) {\n  return (x + y - 1) / y;\n}\n\nstatic inline uint16_t udiv_up16(uint16_t x, uint16_t y) {\n  return (x + y - 1) / y;\n}\n\nstatic inline uint32_t udiv_up32(uint32_t x, uint32_t y) {\n  return (x + y - 1) / y;\n}\n\nstatic inline uint64_t udiv_up64(uint64_t x, uint64_t y) {\n  return (x + y - 1) / y;\n}\n\nstatic inline uint8_t umod8(uint8_t x, uint8_t y) {\n  return x % y;\n}\n\nstatic inline uint16_t umod16(uint16_t x, uint16_t y) {\n  return x % y;\n}\n\nstatic inline uint32_t umod32(uint32_t x, uint32_t y) {\n  return x % y;\n}\n\nstatic inline uint64_t umod64(uint64_t x, uint64_t y) {\n  return x % y;\n}\n\nstatic inline uint8_t udiv_safe8(u",
                                       "int8_t x, uint8_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint16_t udiv_safe16(uint16_t x, uint16_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint32_t udiv_safe32(uint32_t x, uint32_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint64_t udiv_safe64(uint64_t x, uint64_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) {\n  return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) {\n  return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) {\n  return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) {\n  return y == 0 ? 0 : (x + y - 1) / y;\n}\n\nstatic inline uint8_t umod_safe8(uint8_t x, uint8_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint16_t umod_safe16(uint16_t x, uint16_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint32_t umod_safe32(uint32_t x, uint32_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline uint64_t umod_safe64(uint64_t x, uint64_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline int8_t sdiv8(int8_t x, int8_t y) {\n  int8_t q = x / y;\n  int8_t r = x % y;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int16_t sdiv16(int16_t x, int16_t y) {\n  int16_t q = x / y;\n  int16_t r = x % y;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int32_t sdiv32(int32_t x, int32_t y) {\n  int32_t q = x / y;\n  int32_t r = x % y;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int64_t sdiv64(int64_t x, int64_t y) {\n  int64_t q = x / y;\n  int64_t r = x % y;\n\n  return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);\n}\n\nstatic inline int8_t sdiv_up8(int8_t x, int8_t y) {\n  return sdiv8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up16(int16_t x, int16_t y) {\n  return sdiv16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up32(int32_t x, int32_t y) {\n  return sdiv32(x + y - 1, y);\n}\n\nstatic inline int64", "_t sdiv_up64(int64_t x, int64_t y) {\n  return sdiv64(x + y - 1, y);\n}\n\nstatic inline int8_t smod8(int8_t x, int8_t y) {\n  int8_t r = x % y;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int16_t smod16(int16_t x, int16_t y) {\n  int16_t r = x % y;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int32_t smod32(int32_t x, int32_t y) {\n  int32_t r = x % y;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int64_t smod64(int64_t x, int64_t y) {\n  int64_t r = x % y;\n\n  return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);\n}\n\nstatic inline int8_t sdiv_safe8(int8_t x, int8_t y) {\n  return y == 0 ? 0 : sdiv8(x, y);\n}\n\nstatic inline int16_t sdiv_safe16(int16_t x, int16_t y) {\n  return y == 0 ? 0 : sdiv16(x, y);\n}\n\nstatic inline int32_t sdiv_safe32(int32_t x, int32_t y) {\n  return y == 0 ? 0 : sdiv32(x, y);\n}\n\nstatic inline int64_t sdiv_safe64(int64_t x, int64_t y) {\n  return y == 0 ? 0 : sdiv64(x, y);\n}\n\nstatic inline int8_t sdiv_up_safe8(int8_t x, int8_t y) {\n  return sdiv_safe8(x + y - 1, y);\n}\n\nstatic inline int16_t sdiv_up_safe16(int16_t x, int16_t y) {\n  return sdiv_safe16(x + y - 1, y);\n}\n\nstatic inline int32_t sdiv_up_safe32(int32_t x, int32_t y) {\n  return sdiv_safe32(x + y - 1, y);\n}\n\nstatic inline int64_t sdiv_up_safe64(int64_t x, int64_t y) {\n  return sdiv_safe64(x + y - 1, y);\n}\n\nstatic inline int8_t smod_safe8(int8_t x, int8_t y) {\n  return y == 0 ? 0 : smod8(x, y);\n}\n\nstatic inline int16_t smod_safe16(int16_t x, int16_t y) {\n  return y == 0 ? 0 : smod16(x, y);\n}\n\nstatic inline int32_t smod_safe32(int32_t x, int32_t y) {\n  return y == 0 ? 0 : smod32(x, y);\n}\n\nstatic inline int64_t smod_safe64(int64_t x, int64_t y) {\n  return y == 0 ? 0 : smod64(x, y);\n}\n\nstatic inline int8_t squot8(int8_t x, int8_t y) {\n  return x / y;\n}\n\nstatic inline int16_t squot16(int16_t x, int16_t y) {\n  return x / y;\n}\n\nstatic inline int32_t squot32(int32", "_t x, int32_t y) {\n  return x / y;\n}\n\nstatic inline int64_t squot64(int64_t x, int64_t y) {\n  return x / y;\n}\n\nstatic inline int8_t srem8(int8_t x, int8_t y) {\n  return x % y;\n}\n\nstatic inline int16_t srem16(int16_t x, int16_t y) {\n  return x % y;\n}\n\nstatic inline int32_t srem32(int32_t x, int32_t y) {\n  return x % y;\n}\n\nstatic inline int64_t srem64(int64_t x, int64_t y) {\n  return x % y;\n}\n\nstatic inline int8_t squot_safe8(int8_t x, int8_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline int16_t squot_safe16(int16_t x, int16_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline int32_t squot_safe32(int32_t x, int32_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline int64_t squot_safe64(int64_t x, int64_t y) {\n  return y == 0 ? 0 : x / y;\n}\n\nstatic inline int8_t srem_safe8(int8_t x, int8_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline int16_t srem_safe16(int16_t x, int16_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline int32_t srem_safe32(int32_t x, int32_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\nstatic inline int64_t srem_safe64(int64_t x, int64_t y) {\n  return y == 0 ? 0 : x % y;\n}\n\n#endif\n\nstatic inline int8_t smin8(int8_t x, int8_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline int16_t smin16(int16_t x, int16_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline int32_t smin32(int32_t x, int32_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline int64_t smin64(int64_t x, int64_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline uint8_t umin8(uint8_t x, uint8_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline uint16_t umin16(uint16_t x, uint16_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline uint32_t umin32(uint32_t x, uint32_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline uint64_t umin64(uint64_t x, uint64_t y) {\n  return x < y ? x : y;\n}\n\nstatic inline int8_t smax8(int8_t x, int8_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline int16_t smax16(int16_t x, int16_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline int32_t smax32(int32_t x, int32_t y) {\n  return x < y ? y : x;\n}\n\ns",
                                       "tatic inline int64_t smax64(int64_t x, int64_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline uint8_t umax8(uint8_t x, uint8_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline uint16_t umax16(uint16_t x, uint16_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline uint32_t umax32(uint32_t x, uint32_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline uint64_t umax64(uint64_t x, uint64_t y) {\n  return x < y ? y : x;\n}\n\nstatic inline uint8_t shl8(uint8_t x, uint8_t y) {\n  return (uint8_t)(x << y);\n}\n\nstatic inline uint16_t shl16(uint16_t x, uint16_t y) {\n  return (uint16_t)(x << y);\n}\n\nstatic inline uint32_t shl32(uint32_t x, uint32_t y) {\n  return x << y;\n}\n\nstatic inline uint64_t shl64(uint64_t x, uint64_t y) {\n  return x << y;\n}\n\nstatic inline uint8_t lshr8(uint8_t x, uint8_t y) {\n  return x >> y;\n}\n\nstatic inline uint16_t lshr16(uint16_t x, uint16_t y) {\n  return x >> y;\n}\n\nstatic inline uint32_t lshr32(uint32_t x, uint32_t y) {\n  return x >> y;\n}\n\nstatic inline uint64_t lshr64(uint64_t x, uint64_t y) {\n  return x >> y;\n}\n\nstatic inline int8_t ashr8(int8_t x, int8_t y) {\n  return x >> y;\n}\n\nstatic inline int16_t ashr16(int16_t x, int16_t y) {\n  return x >> y;\n}\n\nstatic inline int32_t ashr32(int32_t x, int32_t y) {\n  return x >> y;\n}\n\nstatic inline int64_t ashr64(int64_t x, int64_t y) {\n  return x >> y;\n}\n\nstatic inline uint8_t and8(uint8_t x, uint8_t y) {\n  return x & y;\n}\n\nstatic inline uint16_t and16(uint16_t x, uint16_t y) {\n  return x & y;\n}\n\nstatic inline uint32_t and32(uint32_t x, uint32_t y) {\n  return x & y;\n}\n\nstatic inline uint64_t and64(uint64_t x, uint64_t y) {\n  return x & y;\n}\n\nstatic inline uint8_t or8(uint8_t x, uint8_t y) {\n  return x | y;\n}\n\nstatic inline uint16_t or16(uint16_t x, uint16_t y) {\n  return x | y;\n}\n\nstatic inline uint32_t or32(uint32_t x, uint32_t y) {\n  return x | y;\n}\n\nstatic inline uint64_t or64(uint64_t x, uint64_t y) {\n  return x | y;\n}\n\nstatic inline uint8_t xor8(uint8_t x, uint8_t y) {\n  return x ^ y;\n}\n\nstatic inline uint16_t xor16(uint16_", "t x, uint16_t y) {\n  return x ^ y;\n}\n\nstatic inline uint32_t xor32(uint32_t x, uint32_t y) {\n  return x ^ y;\n}\n\nstatic inline uint64_t xor64(uint64_t x, uint64_t y) {\n  return x ^ y;\n}\n\nstatic inline bool ult8(uint8_t x, uint8_t y) {\n  return x < y;\n}\n\nstatic inline bool ult16(uint16_t x, uint16_t y) {\n  return x < y;\n}\n\nstatic inline bool ult32(uint32_t x, uint32_t y) {\n  return x < y;\n}\n\nstatic inline bool ult64(uint64_t x, uint64_t y) {\n  return x < y;\n}\n\nstatic inline bool ule8(uint8_t x, uint8_t y) {\n  return x <= y;\n}\n\nstatic inline bool ule16(uint16_t x, uint16_t y) {\n  return x <= y;\n}\n\nstatic inline bool ule32(uint32_t x, uint32_t y) {\n  return x <= y;\n}\n\nstatic inline bool ule64(uint64_t x, uint64_t y) {\n  return x <= y;\n}\n\nstatic inline bool slt8(int8_t x, int8_t y) {\n  return x < y;\n}\n\nstatic inline bool slt16(int16_t x, int16_t y) {\n  return x < y;\n}\n\nstatic inline bool slt32(int32_t x, int32_t y) {\n  return x < y;\n}\n\nstatic inline bool slt64(int64_t x, int64_t y) {\n  return x < y;\n}\n\nstatic inline bool sle8(int8_t x, int8_t y) {\n  return x <= y;\n}\n\nstatic inline bool sle16(int16_t x, int16_t y) {\n  return x <= y;\n}\n\nstatic inline bool sle32(int32_t x, int32_t y) {\n  return x <= y;\n}\n\nstatic inline bool sle64(int64_t x, int64_t y) {\n  return x <= y;\n}\n\nstatic inline uint8_t pow8(uint8_t x, uint8_t y) {\n  uint8_t res = 1, rem = y;\n\n  while (rem != 0) {\n    if (rem & 1)\n      res *= x;\n    rem >>= 1;\n    x *= x;\n  }\n  return res;\n}\n\nstatic inline uint16_t pow16(uint16_t x, uint16_t y) {\n  uint16_t res = 1, rem = y;\n\n  while (rem != 0) {\n    if (rem & 1)\n      res *= x;\n    rem >>= 1;\n    x *= x;\n  }\n  return res;\n}\n\nstatic inline uint32_t pow32(uint32_t x, uint32_t y) {\n  uint32_t res = 1, rem = y;\n\n  while (rem != 0) {\n    if (rem & 1)\n      res *= x;\n    rem >>= 1;\n    x *= x;\n  }\n  return res;\n}\n\nstatic inline uint64_t pow64(uint64_t x, uint64_t y) {\n  uint64_t res = 1, rem = y;\n\n  while (rem != 0) {\n    if (rem & 1)\n      res *= x;\n    rem >>= 1;\n    ", "x *= x;\n  }\n  return res;\n}\n\nstatic inline bool itob_i8_bool(int8_t x) {\n  return x != 0;\n}\n\nstatic inline bool itob_i16_bool(int16_t x) {\n  return x != 0;\n}\n\nstatic inline bool itob_i32_bool(int32_t x) {\n  return x != 0;\n}\n\nstatic inline bool itob_i64_bool(int64_t x) {\n  return x != 0;\n}\n\nstatic inline int8_t btoi_bool_i8(bool x) {\n  return x;\n}\n\nstatic inline int16_t btoi_bool_i16(bool x) {\n  return x;\n}\n\nstatic inline int32_t btoi_bool_i32(bool x) {\n  return x;\n}\n\nstatic inline int64_t btoi_bool_i64(bool x) {\n  return x;\n}\n\n#define sext_i8_i8(x) ((int8_t) (int8_t) (x))\n#define sext_i8_i16(x) ((int16_t) (int8_t) (x))\n#define sext_i8_i32(x) ((int32_t) (int8_t) (x))\n#define sext_i8_i64(x) ((int64_t) (int8_t) (x))\n#define sext_i16_i8(x) ((int8_t) (int16_t) (x))\n#define sext_i16_i16(x) ((int16_t) (int16_t) (x))\n#define sext_i16_i32(x) ((int32_t) (int16_t) (x))\n#define sext_i16_i64(x) ((int64_t) (int16_t) (x))\n#define sext_i32_i8(x) ((int8_t) (int32_t) (x))\n#define sext_i32_i16(x) ((int16_t) (int32_t) (x))\n#define sext_i32_i32(x) ((int32_t) (int32_t) (x))\n#define sext_i32_i64(x) ((int64_t) (int32_t) (x))\n#define sext_i64_i8(x) ((int8_t) (int64_t) (x))\n#define sext_i64_i16(x) ((int16_t) (int64_t) (x))\n#define sext_i64_i32(x) ((int32_t) (int64_t) (x))\n#define sext_i64_i64(x) ((int64_t) (int64_t) (x))\n#define zext_i8_i8(x) ((int8_t) (uint8_t) (x))\n#define zext_i8_i16(x) ((int16_t) (uint8_t) (x))\n#define zext_i8_i32(x) ((int32_t) (uint8_t) (x))\n#define zext_i8_i64(x) ((int64_t) (uint8_t) (x))\n#define zext_i16_i8(x) ((int8_t) (uint16_t) (x))\n#define zext_i16_i16(x) ((int16_t) (uint16_t) (x))\n#define zext_i16_i32(x) ((int32_t) (uint16_t) (x))\n#define zext_i16_i64(x) ((int64_t) (uint16_t) (x))\n#define zext_i32_i8(x) ((int8_t) (uint32_t) (x))\n#define zext_i32_i16(x) ((int16_t) (uint32_t) (x))\n#define zext_i32_i32(x) ((int32_t) (uint32_t) (x))\n#define zext_i32_i64(x) ((int64_t) (uint32_t) (x))\n#define zext_i64_i8(x) ((int8_t) (uint64_t) (x))\n#define zext_i64_i16(x) ((int16_t) (",
                                       "uint64_t) (x))\n#define zext_i64_i32(x) ((int32_t) (uint64_t) (x))\n#define zext_i64_i64(x) ((int64_t) (uint64_t) (x))\n\nstatic int8_t abs8(int8_t x) {\n  return (int8_t)abs(x);\n}\n\nstatic int16_t abs16(int16_t x) {\n  return (int16_t)abs(x);\n}\n\nstatic int32_t abs32(int32_t x) {\n  return abs(x);\n}\n\nstatic int64_t abs64(int64_t x) {\n#if defined(__OPENCL_VERSION__) || defined(ISPC)\n  return abs(x);\n#else\n  return llabs(x);\n#endif\n}\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_popc8(int8_t x) {\n  return popcount(x);\n}\n\nstatic int32_t futrts_popc16(int16_t x) {\n  return popcount(x);\n}\n\nstatic int32_t futrts_popc32(int32_t x) {\n  return popcount(x);\n}\n\nstatic int32_t futrts_popc64(int64_t x) {\n  return popcount(x);\n}\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_popc8(int8_t x) {\n  return __popc(zext_i8_i32(x));\n}\n\nstatic int32_t futrts_popc16(int16_t x) {\n  return __popc(zext_i16_i32(x));\n}\n\nstatic int32_t futrts_popc32(int32_t x) {\n  return __popc(x);\n}\n\nstatic int32_t futrts_popc64(int64_t x) {\n  return __popcll(x);\n}\n\n#else // Not OpenCL or CUDA, but plain C.\n\nstatic int32_t futrts_popc8(uint8_t x) {\n  int c = 0;\n  for (; x; ++c) { x &= x - 1; }\n  return c;\n}\n\nstatic int32_t futrts_popc16(uint16_t x) {\n  int c = 0;\n  for (; x; ++c) { x &= x - 1; }\n  return c;\n}\n\nstatic int32_t futrts_popc32(uint32_t x) {\n  int c = 0;\n  for (; x; ++c) { x &= x - 1; }\n  return c;\n}\n\nstatic int32_t futrts_popc64(uint64_t x) {\n  int c = 0;\n  for (; x; ++c) { x &= x - 1; }\n  return c;\n}\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic uint8_t  futrts_umul_hi8 ( uint8_t a,  uint8_t b) { return mul_hi(a, b); }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); }\nstatic uint8_t  futrts_smul_hi8 ( int8_t a,  int8_t b) { return mul_hi(a, b); }\nstatic uint16_t futrts_smul_hi16(int16_t a, int16_t b", ") { return mul_hi(a, b); }\nstatic uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); }\nstatic uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); }\n#elif defined(__CUDA_ARCH__)\nstatic  uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); }\nstatic  uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }\nstatic uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }\nstatic uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }\nstatic uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }\n#elif ISPC\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) {\n  uint64_t ah = a >> 32;\n  uint64_t al = a & 0xffffffff;\n  uint64_t bh = b >> 32;\n  uint64_t bl = b & 0xffffffff;\n\n  uint64_t p1 = al * bl;\n  uint64_t p2 = al * bh;\n  uint64_t p3 = ah * bl;\n  uint64_t p4 = ah * bh;\n\n  uint64_t p1h = p1 >> 32;\n  uint64_t p2h = p2 >> 32;\n  uint64_t p3h = p3 >> 32;\n  uint64_t p2l = p2 & 0xffffffff;\n  uint64_t p3l = p3 & 0xffffffff;\n\n  uint64_t l = p1h + p2l + p3l;\n  uint64_t m = (p2 >> 32) + (p3 >> 32);\n  uint64_t h = (l >> 32) + m + p4;\n\n  return h;\n}\nstatic  int8_t futrts_smul_hi8 ( int8_t a,  int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a", ") * ((uint32_t)b) >> 16; }\nstatic int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic int64_t futrts_smul_hi64(int64_t a, int64_t b) {\n  uint64_t ah = a >> 32;\n  uint64_t al = a & 0xffffffff;\n  uint64_t bh = b >> 32;\n  uint64_t bl = b & 0xffffffff;\n\n  uint64_t p1 =  al * bl;\n  int64_t  p2 = al * bh;\n  int64_t  p3 = ah * bl;\n  uint64_t p4 =  ah * bh;\n\n  uint64_t p1h = p1 >> 32;\n  uint64_t p2h = p2 >> 32;\n  uint64_t p3h = p3 >> 32;\n  uint64_t p2l = p2 & 0xffffffff;\n  uint64_t p3l = p3 & 0xffffffff;\n\n  uint64_t l = p1h + p2l + p3l;\n  uint64_t m = (p2 >> 32) + (p3 >> 32);\n  uint64_t h = (l >> 32) + m + p4;\n\n  return h;\n}\n\n#else // Not OpenCL, ISPC, or CUDA, but plain C.\nstatic uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }\nstatic uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }\nstatic uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }\nstatic uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; }\nstatic int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; }\nstatic int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }\nstatic int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; }\nstatic int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; }\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic  uint8_t futrts_umad_hi8 ( uint8_t a,  uint8_t b,  uint8_t c) { return mad_hi(a, b, c); }\nstatic uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); }\nstatic uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); }\nstatic uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); }\nstatic  uint8_t futrts_sm",
                                       "ad_hi8( int8_t a,  int8_t b,   int8_t c) { return mad_hi(a, b, c); }\nstatic uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); }\nstatic uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); }\nstatic uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); }\n#else // Not OpenCL\n\nstatic  uint8_t futrts_umad_hi8( uint8_t a,  uint8_t b,  uint8_t c) { return futrts_umul_hi8(a, b) + c; }\nstatic uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; }\nstatic uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; }\nstatic uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; }\nstatic  uint8_t futrts_smad_hi8 ( int8_t a,  int8_t b,  int8_t c) { return futrts_smul_hi8(a, b) + c; }\nstatic uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; }\nstatic uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; }\nstatic uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; }\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_clzz8(int8_t x) {\n  return clz(x);\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n  return clz(x);\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n  return clz(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n  return clz(x);\n}\n\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_clzz8(int8_t x) {\n  return __clz(zext_i8_i32(x)) - 24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n  return __clz(zext_i16_i32(x)) - 16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n  return __clz(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n  return __clzll(x);\n}\n\n#elif ISPC\n\nstatic int32_t futrts_clzz8(int8_t x) {\n  return count_leading_zeros((int32_t)(uint8_t)x)-24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n  return count_lead", "ing_zeros((int32_t)(uint16_t)x)-16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n  return count_leading_zeros(x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n  return count_leading_zeros(x);\n}\n\n#else // Not OpenCL, ISPC or CUDA, but plain C.\n\nstatic int32_t futrts_clzz8(int8_t x) {\n  return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24;\n}\n\nstatic int32_t futrts_clzz16(int16_t x) {\n  return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16;\n}\n\nstatic int32_t futrts_clzz32(int32_t x) {\n  return x == 0 ? 32 : __builtin_clz((uint32_t)x);\n}\n\nstatic int32_t futrts_clzz64(int64_t x) {\n  return x == 0 ? 64 : __builtin_clzll((uint64_t)x);\n}\n#endif\n\n#if defined(__OPENCL_VERSION__)\nstatic int32_t futrts_ctzz8(int8_t x) {\n  int i = 0;\n  for (; i < 8 && (x & 1) == 0; i++, x >>= 1)\n    ;\n  return i;\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n  int i = 0;\n  for (; i < 16 && (x & 1) == 0; i++, x >>= 1)\n    ;\n  return i;\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n  int i = 0;\n  for (; i < 32 && (x & 1) == 0; i++, x >>= 1)\n    ;\n  return i;\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n  int i = 0;\n  for (; i < 64 && (x & 1) == 0; i++, x >>= 1)\n    ;\n  return i;\n}\n\n#elif defined(__CUDA_ARCH__)\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n  int y = __ffs(x);\n  return y == 0 ? 8 : y - 1;\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n  int y = __ffs(x);\n  return y == 0 ? 16 : y - 1;\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n  int y = __ffs(x);\n  return y == 0 ? 32 : y - 1;\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n  int y = __ffsll(x);\n  return y == 0 ? 64 : y - 1;\n}\n\n#elif ISPC\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n  return x == 0 ? 8 : count_trailing_zeros((int32_t)x);\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n  return x == 0 ? 16 : count_trailing_zeros((int32_t)x);\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n  return count_trailing_zeros(x);\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n  return count_trailing_zeros(x);\n}\n\n#else // Not OpenCL or CUDA,", " but plain C.\n\nstatic int32_t futrts_ctzz8(int8_t x) {\n  return x == 0 ? 8 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz16(int16_t x) {\n  return x == 0 ? 16 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz32(int32_t x) {\n  return x == 0 ? 32 : __builtin_ctz((uint32_t)x);\n}\n\nstatic int32_t futrts_ctzz64(int64_t x) {\n  return x == 0 ? 64 : __builtin_ctzll((uint64_t)x);\n}\n#endif\n\nstatic inline float fdiv32(float x, float y) {\n  return x / y;\n}\n\nstatic inline float fadd32(float x, float y) {\n  return x + y;\n}\n\nstatic inline float fsub32(float x, float y) {\n  return x - y;\n}\n\nstatic inline float fmul32(float x, float y) {\n  return x * y;\n}\n\nstatic inline bool cmplt32(float x, float y) {\n  return x < y;\n}\n\nstatic inline bool cmple32(float x, float y) {\n  return x <= y;\n}\n\nstatic inline float sitofp_i8_f32(int8_t x) {\n  return (float) x;\n}\n\nstatic inline float sitofp_i16_f32(int16_t x) {\n  return (float) x;\n}\n\nstatic inline float sitofp_i32_f32(int32_t x) {\n  return (float) x;\n}\n\nstatic inline float sitofp_i64_f32(int64_t x) {\n  return (float) x;\n}\n\nstatic inline float uitofp_i8_f32(uint8_t x) {\n  return (float) x;\n}\n\nstatic inline float uitofp_i16_f32(uint16_t x) {\n  return (float) x;\n}\n\nstatic inline float uitofp_i32_f32(uint32_t x) {\n  return (float) x;\n}\n\nstatic inline float uitofp_i64_f32(uint64_t x) {\n  return (float) x;\n}\n\n#ifdef __OPENCL_VERSION__\nstatic inline float fabs32(float x) {\n  return fabs(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n  return fmax(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n  return fmin(x, y);\n}\n\nstatic inline float fpow32(float x, float y) {\n  return pow(x, y);\n}\n\n#elif ISPC\n\nstatic inline float fabs32(float x) {\n  return abs(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n  return isnan(x) ? y : isnan(y) ? x : max(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n  return isnan(x) ? y : isnan(y) ? x : min(x, y);\n}\n\nstatic inline float fpow32(float a, float b) {\n  float ret;\n  f",
                                       "oreach_active (i) {\n      uniform float r = __stdlib_powf(extract(a, i), extract(b, i));\n      ret = insert(ret, i, r);\n  }\n  return ret;\n}\n\n#else // Not OpenCL, but CUDA or plain C.\n\nstatic inline float fabs32(float x) {\n  return fabsf(x);\n}\n\nstatic inline float fmax32(float x, float y) {\n  return fmaxf(x, y);\n}\n\nstatic inline float fmin32(float x, float y) {\n  return fminf(x, y);\n}\n\nstatic inline float fpow32(float x, float y) {\n  return powf(x, y);\n}\n#endif\n\nstatic inline bool futrts_isnan32(float x) {\n  return isnan(x);\n}\n\n#if ISPC\n\nstatic inline bool futrts_isinf32(float x) {\n  return !isnan(x) && isnan(x - x);\n}\n\nstatic inline bool futrts_isfinite32(float x) {\n  return !isnan(x) && !futrts_isinf32(x);\n}\n\n#else\n\nstatic inline bool futrts_isinf32(float x) {\n  return isinf(x);\n}\n\n#endif\n\nstatic inline int8_t fptosi_f32_i8(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (int8_t) x;\n  }\n}\n\nstatic inline int16_t fptosi_f32_i16(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (int16_t) x;\n  }\n}\n\nstatic inline int32_t fptosi_f32_i32(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (int32_t) x;\n  }\n}\n\nstatic inline int64_t fptosi_f32_i64(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (int64_t) x;\n  };\n}\n\nstatic inline uint8_t fptoui_f32_i8(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (uint8_t) (int8_t) x;\n  }\n}\n\nstatic inline uint16_t fptoui_f32_i16(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (uint16_t) (int16_t) x;\n  }\n}\n\nstatic inline uint32_t fptoui_f32_i32(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    return 0;\n  } else {\n    return (uint32_t) (int32_t) x;\n  }\n}\n\nstatic inline uint64_t fptoui_f32_i64(float x) {\n  if (futrts_isnan32(x) || futrts_isinf32(x)) {\n    ret", "urn 0;\n  } else {\n    return (uint64_t) (int64_t) x;\n  }\n}\n\nstatic inline bool ftob_f32_bool(float x) {\n  return x != 0;\n}\n\nstatic inline float btof_bool_f32(bool x) {\n  return x ? 1 : 0;\n}\n\n#ifdef __OPENCL_VERSION__\nstatic inline float futrts_log32(float x) {\n  return log(x);\n}\n\nstatic inline float futrts_log2_32(float x) {\n  return log2(x);\n}\n\nstatic inline float futrts_log10_32(float x) {\n  return log10(x);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n  return log1p(x);\n}\n\nstatic inline float futrts_sqrt32(float x) {\n  return sqrt(x);\n}\n\nstatic inline float futrts_cbrt32(float x) {\n  return cbrt(x);\n}\n\nstatic inline float futrts_exp32(float x) {\n  return exp(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n  return cos(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n  return sin(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n  return tan(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n  return acos(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n  return asin(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n  return atan(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n  return cosh(x);\n}\n\nstatic inline float futrts_sinh32(float x) {\n  return sinh(x);\n}\n\nstatic inline float futrts_tanh32(float x) {\n  return tanh(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n  return acosh(x);\n}\n\nstatic inline float futrts_asinh32(float x) {\n  return asinh(x);\n}\n\nstatic inline float futrts_atanh32(float x) {\n  return atanh(x);\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n  return atan2(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n  return hypot(x, y);\n}\n\nstatic inline float futrts_gamma32(float x) {\n  return tgamma(x);\n}\n\nstatic inline float futrts_lgamma32(float x) {\n  return lgamma(x);\n}\n\nstatic inline float futrts_erf32(float x) {\n  return erf(x);\n}\n\nstatic inline float futrts_erfc32(float x) {\n  return erfc(x);\n}\n\nstatic inline float fmod32(float x, float y) {\n  return fmod(x, y);\n}\n\nstatic inline float futrt", "s_round32(float x) {\n  return rint(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n  return floor(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n  return ceil(x);\n}\n\nstatic inline float futrts_nextafter32(float x, float y) {\n  return nextafter(x, y);\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n  return mix(v0, v1, t);\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n  return mad(a, b, c);\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n  return fma(a, b, c);\n}\n\n#elif ISPC\n\nstatic inline float futrts_log32(float x) {\n  return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x;\n}\n\nstatic inline float futrts_log2_32(float x) {\n  return futrts_log32(x) / log(2.0f);\n}\n\nstatic inline float futrts_log10_32(float x) {\n  return futrts_log32(x) / log(10.0f);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n  if(x == -1.0f || (futrts_isinf32(x) && x > 0.0f)) return x / 0.0f;\n  float y = 1.0f + x;\n  float z = y - 1.0f;\n  return log(y) - (z-x)/y;\n}\n\nstatic inline float futrts_sqrt32(float x) {\n  return sqrt(x);\n}\n\nextern \"C\" unmasked uniform float cbrtf(uniform float);\nstatic inline float futrts_cbrt32(float x) {\n  float res;\n  foreach_active (i) {\n    uniform float r = cbrtf(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline float futrts_exp32(float x) {\n  return exp(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n  return cos(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n  return sin(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n  return tan(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n  return acos(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n  return asin(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n  return atan(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n  return (exp(x)+exp(-x)) / 2.0f;\n}\n\nstatic inline float futrts_sinh32(float x) {\n  return (exp(x)-exp(-x)) / 2.0f;\n}\n\nstatic inline float futrts_tanh32(float x) {\n  retur",
                                       "n futrts_sinh32(x)/futrts_cosh32(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n  float f = x+sqrt(x*x-1);\n  if(futrts_isfinite32(f)) return log(f);\n  return f;\n}\n\nstatic inline float futrts_asinh32(float x) {\n  float f = x+sqrt(x*x+1);\n  if(futrts_isfinite32(f)) return log(f);\n  return f;\n\n}\n\nstatic inline float futrts_atanh32(float x) {\n  float f = (1+x)/(1-x);\n  if(futrts_isfinite32(f)) return log(f)/2.0f;\n  return f;\n\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n  return (x == 0.0f && y == 0.0f) ? 0.0f : atan2(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n  if (futrts_isfinite32(x) && futrts_isfinite32(y)) {\n    x = abs(x);\n    y = abs(y);\n    float a;\n    float b;\n    if (x >= y){\n        a = x;\n        b = y;\n    } else {\n        a = y;\n        b = x;\n    }\n    if(b == 0){\n      return a;\n    }\n\n    int e;\n    float an;\n    float bn;\n    an = frexp (a, &e);\n    bn = ldexp (b, - e);\n    float cn;\n    cn = sqrt (an * an + bn * bn);\n    return ldexp (cn, e);\n  } else {\n    if (futrts_isinf32(x) || futrts_isinf32(y)) return INFINITY;\n    else return x + y;\n  }\n\n}\n\nextern \"C\" unmasked uniform float tgammaf(uniform float x);\nstatic inline float futrts_gamma32(float x) {\n  float res;\n  foreach_active (i) {\n    uniform float r = tgammaf(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform float lgammaf(uniform float x);\nstatic inline float futrts_lgamma32(float x) {\n  float res;\n  foreach_active (i) {\n    uniform float r = lgammaf(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform float erff(uniform float x);\nstatic inline float futrts_erf32(float x) {\n  float res;\n  foreach_active (i) {\n    uniform float r = erff(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform float erfcf(uniform float x);\nstatic inline float futrts_erfc32(float x) {\n  float res;\n  foreach_active (i) {\n    uniform float r = erfcf(extr", "act(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline float fmod32(float x, float y) {\n  return x - y * trunc(x/y);\n}\n\nstatic inline float futrts_round32(float x) {\n  return round(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n  return floor(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n  return ceil(x);\n}\n\nextern \"C\" unmasked uniform float nextafterf(uniform float x, uniform float y);\nstatic inline float futrts_nextafter32(float x, float y) {\n  float res;\n  foreach_active (i) {\n    uniform float r = nextafterf(extract(x, i), extract(y, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n  return v0 + (v1 - v0) * t;\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n  return a * b + c;\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n  return a * b + c;\n}\n\n#else // Not OpenCL or ISPC, but CUDA or plain C.\n\nstatic inline float futrts_log32(float x) {\n  return logf(x);\n}\n\nstatic inline float futrts_log2_32(float x) {\n  return log2f(x);\n}\n\nstatic inline float futrts_log10_32(float x) {\n  return log10f(x);\n}\n\nstatic inline float futrts_log1p_32(float x) {\n  return log1pf(x);\n}\n\nstatic inline float futrts_sqrt32(float x) {\n  return sqrtf(x);\n}\n\nstatic inline float futrts_cbrt32(float x) {\n  return cbrtf(x);\n}\n\nstatic inline float futrts_exp32(float x) {\n  return expf(x);\n}\n\nstatic inline float futrts_cos32(float x) {\n  return cosf(x);\n}\n\nstatic inline float futrts_sin32(float x) {\n  return sinf(x);\n}\n\nstatic inline float futrts_tan32(float x) {\n  return tanf(x);\n}\n\nstatic inline float futrts_acos32(float x) {\n  return acosf(x);\n}\n\nstatic inline float futrts_asin32(float x) {\n  return asinf(x);\n}\n\nstatic inline float futrts_atan32(float x) {\n  return atanf(x);\n}\n\nstatic inline float futrts_cosh32(float x) {\n  return coshf(x);\n}\n\nstatic inline float futrts_sinh32(float x) {\n  return sinhf(x);\n}\n\nstatic inline float futrts_tanh32(float x) {\n  r", "eturn tanhf(x);\n}\n\nstatic inline float futrts_acosh32(float x) {\n  return acoshf(x);\n}\n\nstatic inline float futrts_asinh32(float x) {\n  return asinhf(x);\n}\n\nstatic inline float futrts_atanh32(float x) {\n  return atanhf(x);\n}\n\nstatic inline float futrts_atan2_32(float x, float y) {\n  return atan2f(x, y);\n}\n\nstatic inline float futrts_hypot32(float x, float y) {\n  return hypotf(x, y);\n}\n\nstatic inline float futrts_gamma32(float x) {\n  return tgammaf(x);\n}\n\nstatic inline float futrts_lgamma32(float x) {\n  return lgammaf(x);\n}\n\nstatic inline float futrts_erf32(float x) {\n  return erff(x);\n}\n\nstatic inline float futrts_erfc32(float x) {\n  return erfcf(x);\n}\n\nstatic inline float fmod32(float x, float y) {\n  return fmodf(x, y);\n}\n\nstatic inline float futrts_round32(float x) {\n  return rintf(x);\n}\n\nstatic inline float futrts_floor32(float x) {\n  return floorf(x);\n}\n\nstatic inline float futrts_ceil32(float x) {\n  return ceilf(x);\n}\n\nstatic inline float futrts_nextafter32(float x, float y) {\n  return nextafterf(x, y);\n}\n\nstatic inline float futrts_lerp32(float v0, float v1, float t) {\n  return v0 + (v1 - v0) * t;\n}\n\nstatic inline float futrts_mad32(float a, float b, float c) {\n  return a * b + c;\n}\n\nstatic inline float futrts_fma32(float a, float b, float c) {\n  return fmaf(a, b, c);\n}\n#endif\n\n#if ISPC\nstatic inline int32_t futrts_to_bits32(float x) {\n  return intbits(x);\n}\n\nstatic inline float futrts_from_bits32(int32_t x) {\n  return floatbits(x);\n}\n#else\nstatic inline int32_t futrts_to_bits32(float x) {\n  union {\n    float f;\n    int32_t t;\n  } p;\n\n  p.f = x;\n  return p.t;\n}\n\nstatic inline float futrts_from_bits32(int32_t x) {\n  union {\n    int32_t f;\n    float t;\n  } p;\n\n  p.f = x;\n  return p.t;\n}\n#endif\n\nstatic inline float fsignum32(float x) {\n  return futrts_isnan32(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\n#if ISPC\nstatic inline bool futrts_isinf64(float x) {\n  return !isnan(x) && isnan(x - x);\n}\n\nstatic inline bool futrts_isfinite64(fl",
                                       "oat x) {\n  return !isnan(x) && !futrts_isinf64(x);\n}\n\nstatic inline double fdiv64(double x, double y) {\n  return x / y;\n}\n\nstatic inline double fadd64(double x, double y) {\n  return x + y;\n}\n\nstatic inline double fsub64(double x, double y) {\n  return x - y;\n}\n\nstatic inline double fmul64(double x, double y) {\n  return x * y;\n}\n\nstatic inline bool cmplt64(double x, double y) {\n  return x < y;\n}\n\nstatic inline bool cmple64(double x, double y) {\n  return x <= y;\n}\n\nstatic inline double sitofp_i8_f64(int8_t x) {\n  return (double) x;\n}\n\nstatic inline double sitofp_i16_f64(int16_t x) {\n  return (double) x;\n}\n\nstatic inline double sitofp_i32_f64(int32_t x) {\n  return (double) x;\n}\n\nstatic inline double sitofp_i64_f64(int64_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i8_f64(uint8_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i16_f64(uint16_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i32_f64(uint32_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i64_f64(uint64_t x) {\n  return (double) x;\n}\n\nstatic inline double fabs64(double x) {\n  return abs(x);\n}\n\nstatic inline double fmax64(double x, double y) {\n  return isnan(x) ? y : isnan(y) ? x : max(x, y);\n}\n\nstatic inline double fmin64(double x, double y) {\n  return isnan(x) ? y : isnan(y) ? x : min(x, y);\n}\n\nstatic inline double fpow64(double a, double b) {\n  float ret;\n  foreach_active (i) {\n      uniform float r = __stdlib_powf(extract(a, i), extract(b, i));\n      ret = insert(ret, i, r);\n  }\n  return ret;\n}\n\nstatic inline double futrts_log64(double x) {\n  return futrts_isfinite64(x) || (futrts_isinf64(x) && x < 0)? log(x) : x;\n}\n\nstatic inline double futrts_log2_64(double x) {\n  return futrts_log64(x)/log(2.0d);\n}\n\nstatic inline double futrts_log10_64(double x) {\n  return futrts_log64(x)/log(10.0d);\n}\n\nstatic inline double futrts_log1p_64(double x) {\n  if(x == -1.0d || (futrts_isinf64(x) && x > 0.0d)) return x / 0.0d;\n  double y = 1.0d + x;\n  double z = y - 1.0d;\n  return log", "(y) - (z-x)/y;\n}\n\nstatic inline double futrts_sqrt64(double x) {\n  return sqrt(x);\n}\n\nextern \"C\" unmasked uniform double cbrt(uniform double);\nstatic inline double futrts_cbrt64(double x) {\n  double res;\n  foreach_active (i) {\n    uniform double r = cbrtf(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline double futrts_exp64(double x) {\n  return exp(x);\n}\n\nstatic inline double futrts_cos64(double x) {\n  return cos(x);\n}\n\nstatic inline double futrts_sin64(double x) {\n  return sin(x);\n}\n\nstatic inline double futrts_tan64(double x) {\n  return tan(x);\n}\n\nstatic inline double futrts_acos64(double x) {\n  return acos(x);\n}\n\nstatic inline double futrts_asin64(double x) {\n  return asin(x);\n}\n\nstatic inline double futrts_atan64(double x) {\n  return atan(x);\n}\n\nstatic inline double futrts_cosh64(double x) {\n  return (exp(x)+exp(-x)) / 2.0d;\n}\n\nstatic inline double futrts_sinh64(double x) {\n  return (exp(x)-exp(-x)) / 2.0d;\n}\n\nstatic inline double futrts_tanh64(double x) {\n  return futrts_sinh64(x)/futrts_cosh64(x);\n}\n\nstatic inline double futrts_acosh64(double x) {\n  double f = x+sqrt(x*x-1.0d);\n  if(futrts_isfinite64(f)) return log(f);\n  return f;\n}\n\nstatic inline double futrts_asinh64(double x) {\n  double f = x+sqrt(x*x+1.0d);\n  if(futrts_isfinite64(f)) return log(f);\n  return f;\n}\n\nstatic inline double futrts_atanh64(double x) {\n  double f = (1.0d+x)/(1.0d-x);\n  if(futrts_isfinite64(f)) return log(f)/2.0d;\n  return f;\n\n}\n\nstatic inline double futrts_atan2_64(double x, double y) {\n  return atan2(x, y);\n}\n\nextern \"C\" unmasked uniform double hypot(uniform double x, uniform double y);\nstatic inline double futrts_hypot64(double x, double y) {\n  double res;\n  foreach_active (i) {\n    uniform double r = hypot(extract(x, i), extract(y, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform double tgamma(uniform double x);\nstatic inline double futrts_gamma64(double x) {\n  double res;\n  foreach_active (i) {\n    uniform double r", " = tgamma(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform double lgamma(uniform double x);\nstatic inline double futrts_lgamma64(double x) {\n  double res;\n  foreach_active (i) {\n    uniform double r = lgamma(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform double erf(uniform double x);\nstatic inline double futrts_erf64(double x) {\n  double res;\n  foreach_active (i) {\n    uniform double r = erf(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform double erfc(uniform double x);\nstatic inline double futrts_erfc64(double x) {\n  double res;\n  foreach_active (i) {\n    uniform double r = erfc(extract(x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline double futrts_fma64(double a, double b, double c) {\n  return a * b + c;\n}\n\nstatic inline double futrts_round64(double x) {\n  return round(x);\n}\n\nstatic inline double futrts_ceil64(double x) {\n  return ceil(x);\n}\n\nextern \"C\" unmasked uniform double nextafter(uniform float x, uniform double y);\nstatic inline float futrts_nextafter64(double x, double y) {\n  double res;\n  foreach_active (i) {\n    uniform double r = nextafter(extract(x, i), extract(y, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline double futrts_floor64(double x) {\n  return floor(x);\n}\n\nstatic inline bool futrts_isnan64(double x) {\n  return isnan(x);\n}\n\nstatic inline int8_t fptosi_f64_i8(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int8_t) x;\n  }\n}\n\nstatic inline int16_t fptosi_f64_i16(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int16_t) x;\n  }\n}\n\nstatic inline int32_t fptosi_f64_i32(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int32_t) x;\n  }\n}\n\nstatic inline int64_t fptosi_f64_i64(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {",
                                       "\n    return 0;\n  } else {\n    return (int64_t) x;\n  }\n}\n\nstatic inline uint8_t fptoui_f64_i8(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint8_t) (int8_t) x;\n  }\n}\n\nstatic inline uint16_t fptoui_f64_i16(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint16_t) (int16_t) x;\n  }\n}\n\nstatic inline uint32_t fptoui_f64_i32(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint32_t) (int32_t) x;\n  }\n}\n\nstatic inline uint64_t fptoui_f64_i64(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint64_t) (int64_t) x;\n  }\n}\n\nstatic inline bool ftob_f64_bool(double x) {\n  return x != 0.0;\n}\n\nstatic inline double btof_bool_f64(bool x) {\n  return x ? 1.0 : 0.0;\n}\n\nstatic inline int64_t futrts_to_bits64(double x) {\n  int64_t res;\n  foreach_active (i) {\n    uniform double tmp = extract(x, i);\n    uniform int64_t r = *((uniform int64_t* uniform)&tmp);\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline double futrts_from_bits64(int64_t x) {\n  double res;\n  foreach_active (i) {\n    uniform int64_t tmp = extract(x, i);\n    uniform double r = *((uniform double* uniform)&tmp);\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline double fmod64(double x, double y) {\n  return x - y * trunc(x/y);\n}\n\nstatic inline double fsignum64(double x) {\n  return futrts_isnan64(x) ? x : (x > 0 ? 1.0d : 0.0d) - (x < 0 ? 1.0d : 0.0d);\n}\n\nstatic inline double futrts_lerp64(double v0, double v1, double t) {\n  return v0 + (v1 - v0) * t;\n}\n\nstatic inline double futrts_mad64(double a, double b, double c) {\n  return a * b + c;\n}\n\nstatic inline float fpconv_f32_f32(float x) {\n  return (float) x;\n}\n\nstatic inline double fpconv_f32_f64(float x) {\n  return (double) x;\n}\n\nstatic inline float fpconv_f64_f32(double x) {\n  return (float) x;\n}\n\nstatic inline double fpconv_f64_f64(double x) {\n  return (double", ") x;\n}\n\n#else\n\nstatic inline double fdiv64(double x, double y) {\n  return x / y;\n}\n\nstatic inline double fadd64(double x, double y) {\n  return x + y;\n}\n\nstatic inline double fsub64(double x, double y) {\n  return x - y;\n}\n\nstatic inline double fmul64(double x, double y) {\n  return x * y;\n}\n\nstatic inline bool cmplt64(double x, double y) {\n  return x < y;\n}\n\nstatic inline bool cmple64(double x, double y) {\n  return x <= y;\n}\n\nstatic inline double sitofp_i8_f64(int8_t x) {\n  return (double) x;\n}\n\nstatic inline double sitofp_i16_f64(int16_t x) {\n  return (double) x;\n}\n\nstatic inline double sitofp_i32_f64(int32_t x) {\n  return (double) x;\n}\n\nstatic inline double sitofp_i64_f64(int64_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i8_f64(uint8_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i16_f64(uint16_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i32_f64(uint32_t x) {\n  return (double) x;\n}\n\nstatic inline double uitofp_i64_f64(uint64_t x) {\n  return (double) x;\n}\n\nstatic inline double fabs64(double x) {\n  return fabs(x);\n}\n\nstatic inline double fmax64(double x, double y) {\n  return fmax(x, y);\n}\n\nstatic inline double fmin64(double x, double y) {\n  return fmin(x, y);\n}\n\nstatic inline double fpow64(double x, double y) {\n  return pow(x, y);\n}\n\nstatic inline double futrts_log64(double x) {\n  return log(x);\n}\n\nstatic inline double futrts_log2_64(double x) {\n  return log2(x);\n}\n\nstatic inline double futrts_log10_64(double x) {\n  return log10(x);\n}\n\nstatic inline double futrts_log1p_64(double x) {\n  return log1p(x);\n}\n\nstatic inline double futrts_sqrt64(double x) {\n  return sqrt(x);\n}\n\nstatic inline double futrts_cbrt64(double x) {\n  return cbrt(x);\n}\n\nstatic inline double futrts_exp64(double x) {\n  return exp(x);\n}\n\nstatic inline double futrts_cos64(double x) {\n  return cos(x);\n}\n\nstatic inline double futrts_sin64(double x) {\n  return sin(x);\n}\n\nstatic inline double futrts_tan64(double x) {\n  return tan(x);\n}\n\nstatic inline double futrts_a", "cos64(double x) {\n  return acos(x);\n}\n\nstatic inline double futrts_asin64(double x) {\n  return asin(x);\n}\n\nstatic inline double futrts_atan64(double x) {\n  return atan(x);\n}\n\nstatic inline double futrts_cosh64(double x) {\n  return cosh(x);\n}\n\nstatic inline double futrts_sinh64(double x) {\n  return sinh(x);\n}\n\nstatic inline double futrts_tanh64(double x) {\n  return tanh(x);\n}\n\nstatic inline double futrts_acosh64(double x) {\n  return acosh(x);\n}\n\nstatic inline double futrts_asinh64(double x) {\n  return asinh(x);\n}\n\nstatic inline double futrts_atanh64(double x) {\n  return atanh(x);\n}\n\nstatic inline double futrts_atan2_64(double x, double y) {\n  return atan2(x, y);\n}\n\nstatic inline double futrts_hypot64(double x, double y) {\n  return hypot(x, y);\n}\n\nstatic inline double futrts_gamma64(double x) {\n  return tgamma(x);\n}\n\nstatic inline double futrts_lgamma64(double x) {\n  return lgamma(x);\n}\n\nstatic inline double futrts_erf64(double x) {\n  return erf(x);\n}\n\nstatic inline double futrts_erfc64(double x) {\n  return erfc(x);\n}\n\nstatic inline double futrts_fma64(double a, double b, double c) {\n  return fma(a, b, c);\n}\n\nstatic inline double futrts_round64(double x) {\n  return rint(x);\n}\n\nstatic inline double futrts_ceil64(double x) {\n  return ceil(x);\n}\n\nstatic inline float futrts_nextafter64(float x, float y) {\n  return nextafter(x, y);\n}\n\nstatic inline double futrts_floor64(double x) {\n  return floor(x);\n}\n\nstatic inline bool futrts_isnan64(double x) {\n  return isnan(x);\n}\n\nstatic inline bool futrts_isinf64(double x) {\n  return isinf(x);\n}\n\nstatic inline int8_t fptosi_f64_i8(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int8_t) x;\n  }\n}\n\nstatic inline int16_t fptosi_f64_i16(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int16_t) x;\n  }\n}\n\nstatic inline int32_t fptosi_f64_i32(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int32_t",
                                       ") x;\n  }\n}\n\nstatic inline int64_t fptosi_f64_i64(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (int64_t) x;\n  }\n}\n\nstatic inline uint8_t fptoui_f64_i8(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint8_t) (int8_t) x;\n  }\n}\n\nstatic inline uint16_t fptoui_f64_i16(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint16_t) (int16_t) x;\n  }\n}\n\nstatic inline uint32_t fptoui_f64_i32(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint32_t) (int32_t) x;\n  }\n}\n\nstatic inline uint64_t fptoui_f64_i64(double x) {\n  if (futrts_isnan64(x) || futrts_isinf64(x)) {\n    return 0;\n  } else {\n    return (uint64_t) (int64_t) x;\n  }\n}\n\nstatic inline bool ftob_f64_bool(double x) {\n  return x != 0;\n}\n\nstatic inline double btof_bool_f64(bool x) {\n  return x ? 1 : 0;\n}\n\nstatic inline int64_t futrts_to_bits64(double x) {\n  union {\n    double f;\n    int64_t t;\n  } p;\n\n  p.f = x;\n  return p.t;\n}\n\nstatic inline double futrts_from_bits64(int64_t x) {\n  union {\n    int64_t f;\n    double t;\n  } p;\n\n  p.f = x;\n  return p.t;\n}\n\nstatic inline double fmod64(double x, double y) {\n  return fmod(x, y);\n}\n\nstatic inline double fsignum64(double x) {\n  return futrts_isnan64(x) ? x : (x > 0) - (x < 0);\n}\n\nstatic inline double futrts_lerp64(double v0, double v1, double t) {\n#ifdef __OPENCL_VERSION__\n  return mix(v0, v1, t);\n#else\n  return v0 + (v1 - v0) * t;\n#endif\n}\n\nstatic inline double futrts_mad64(double a, double b, double c) {\n#ifdef __OPENCL_VERSION__\n  return mad(a, b, c);\n#else\n  return a * b + c;\n#endif\n}\n\nstatic inline float fpconv_f32_f32(float x) {\n  return (float) x;\n}\n\nstatic inline double fpconv_f32_f64(float x) {\n  return (double) x;\n}\n\nstatic inline float fpconv_f64_f32(double x) {\n  return (float) x;\n}\n\nstatic inline double fpconv_f64_f64(double x) {\n  return (double) x;\n}\n\n#endif\n\n#endif\n\n// End", " of scalar.h.\n// Start of scalar_f16.h.\n\n// Half-precision is emulated if needed (e.g. in straight C) with the\n// native type used if possible.  The emulation works by typedef'ing\n// 'float' to 'f16', and then implementing all operations on single\n// precision.  To cut down on duplication, we use the same code for\n// those Futhark functions that require just operators or casts.  The\n// in-memory representation for arrays will still be 16 bits even\n// under emulation, so the compiler will have to be careful when\n// generating reads or writes.\n\n#if !defined(cl_khr_fp16) && !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) && !(defined(ISPC))\n#define EMULATE_F16\n#endif\n\n#if !defined(EMULATE_F16) && defined(__OPENCL_VERSION__)\n#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n#endif\n\n#ifdef EMULATE_F16\n\n// Note that the half-precision storage format is still 16 bits - the\n// compiler will have to be real careful!\ntypedef float f16;\n\n#elif ISPC\ntypedef float16 f16;\n\n#else\n\n#ifdef __CUDA_ARCH__\n#include <cuda_fp16.h>\n#endif\n\ntypedef half f16;\n\n#endif\n\n// Some of these functions convert to single precision because half\n// precision versions are not available.\n\nstatic inline f16 fadd16(f16 x, f16 y) {\n  return x + y;\n}\n\nstatic inline f16 fsub16(f16 x, f16 y) {\n  return x - y;\n}\n\nstatic inline f16 fmul16(f16 x, f16 y) {\n  return x * y;\n}\n\nstatic inline bool cmplt16(f16 x, f16 y) {\n  return x < y;\n}\n\nstatic inline bool cmple16(f16 x, f16 y) {\n  return x <= y;\n}\n\nstatic inline f16 sitofp_i8_f16(int8_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 sitofp_i16_f16(int16_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 sitofp_i32_f16(int32_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 sitofp_i64_f16(int64_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 uitofp_i8_f16(uint8_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 uitofp_i16_f16(uint16_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 uitofp_i32_f16(uint32_t x) {\n  return (f16) x;\n}\n\nstatic inline f16 uitofp_i64_f16(uint64_t x) {\n  return (f1", "6) x;\n}\n\nstatic inline int8_t fptosi_f16_i8(f16 x) {\n  return (int8_t) (float) x;\n}\n\nstatic inline int16_t fptosi_f16_i16(f16 x) {\n  return (int16_t) x;\n}\n\nstatic inline int32_t fptosi_f16_i32(f16 x) {\n  return (int32_t) x;\n}\n\nstatic inline int64_t fptosi_f16_i64(f16 x) {\n  return (int64_t) x;\n}\n\nstatic inline uint8_t fptoui_f16_i8(f16 x) {\n  return (uint8_t) (float) x;\n}\n\nstatic inline uint16_t fptoui_f16_i16(f16 x) {\n  return (uint16_t) x;\n}\n\nstatic inline uint32_t fptoui_f16_i32(f16 x) {\n  return (uint32_t) x;\n}\n\nstatic inline uint64_t fptoui_f16_i64(f16 x) {\n  return (uint64_t) x;\n}\n\nstatic inline bool ftob_f16_bool(f16 x) {\n  return x != (f16)0;\n}\n\nstatic inline f16 btof_bool_f16(bool x) {\n  return x ? 1 : 0;\n}\n\n#ifndef EMULATE_F16\nstatic inline bool futrts_isnan16(f16 x) {\n  return isnan((float)x);\n}\n\n#ifdef __OPENCL_VERSION__\n\nstatic inline f16 fabs16(f16 x) {\n  return fabs(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n  return fmax(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n  return fmin(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n  return pow(x, y);\n}\n\n#elif ISPC\nstatic inline f16 fabs16(f16 x) {\n  return abs(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n  return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : max(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n  return futrts_isnan16(x) ? y : futrts_isnan16(y) ? x : min(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n  return pow(x, y);\n}\n#else // Assuming CUDA.\n\nstatic inline f16 fabs16(f16 x) {\n  return fabsf(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n  return fmaxf(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n  return fminf(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n  return powf(x, y);\n}\n#endif\n\n#if ISPC\nstatic inline bool futrts_isinf16(float x) {\n  return !futrts_isnan16(x) && futrts_isnan16(x - x);\n}\nstatic inline bool futrts_isfinite16(float x) {\n  return !futrts_isnan16(x) && !futrts_isinf16(x);\n}\n\n#else\n\nstatic inline bool futrts_isinf16(f16 x) {\n  retu",
                                       "rn isinf((float)x);\n}\n#endif\n\n#ifdef __OPENCL_VERSION__\nstatic inline f16 futrts_log16(f16 x) {\n  return log(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n  return log2(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n  return log10(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n  return log1p(x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n  return sqrt(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n  return cbrt(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n  return exp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n  return cos(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n  return sin(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n  return tan(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n  return acos(x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n  return asin(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n  return atan(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n  return cosh(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n  return sinh(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n  return tanh(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n  return acosh(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n  return asinh(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n  return atanh(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n  return atan2(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n  return hypot(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n  return tgamma(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n  return lgamma(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n  return erf(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n  return erfc(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n  return fmod(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n  return rint(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n  return floor(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n  return ceil(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n  return nextafter(x, y);\n}\n\nstatic inline f16 futrts_", "lerp16(f16 v0, f16 v1, f16 t) {\n  return mix(v0, v1, t);\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n  return mad(a, b, c);\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n  return fma(a, b, c);\n}\n#elif ISPC\n\nstatic inline f16 futrts_log16(f16 x) {\n  return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x;\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n  return futrts_log16(x) / log(2.0f16);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n  return futrts_log16(x) / log(10.0f16);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n  if(x == -1.0f16 || (futrts_isinf16(x) && x > 0.0f16)) return x / 0.0f16;\n  f16 y = 1.0f16 + x;\n  f16 z = y - 1.0f16;\n  return log(y) - (z-x)/y;\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n  return (float16)sqrt((float)x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n  return exp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n  return (float16)cos((float)x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n  return (float16)sin((float)x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n  return (float16)tan((float)x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n  return (float16)acos((float)x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n  return (float16)asin((float)x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n  return (float16)atan((float)x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n  return (exp(x)+exp(-x)) / 2.0f16;\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n  return (exp(x)-exp(-x)) / 2.0f16;\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n  return futrts_sinh16(x)/futrts_cosh16(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n  float16 f = x+(float16)sqrt((float)(x*x-1));\n  if(futrts_isfinite16(f)) return log(f);\n  return f;\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n  float16 f = x+(float16)sqrt((float)(x*x+1));\n  if(futrts_isfinite16(f)) return log(f);\n  return f;\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n  float16 f = (1+x)/(1-x);\n  if(futrts_isfinite16(f)) return log(f)/2.0f16;\n  return f;\n}\n\nstatic inline", " f16 futrts_atan2_16(f16 x, f16 y) {\n  return (float16)atan2((float)x, (float)y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n  return (float16)futrts_hypot32((float)x, (float)y);\n}\n\nextern \"C\" unmasked uniform float tgammaf(uniform float x);\nstatic inline f16 futrts_gamma16(f16 x) {\n  f16 res;\n  foreach_active (i) {\n    uniform f16 r = (f16)tgammaf(extract((float)x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nextern \"C\" unmasked uniform float lgammaf(uniform float x);\nstatic inline f16 futrts_lgamma16(f16 x) {\n  f16 res;\n  foreach_active (i) {\n    uniform f16 r = (f16)lgammaf(extract((float)x, i));\n    res = insert(res, i, r);\n  }\n  return res;\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n  f16 res = (f16)futrts_cbrt32((float)x);\n  return res;\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n  f16 res = (f16)futrts_erf32((float)x);\n  return res;\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n  f16 res = (f16)futrts_erfc32((float)x);\n  return res;\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n  return x - y * (float16)trunc((float) (x/y));\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n  return (float16)round((float)x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n  return (float16)floor((float)x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n  return (float16)ceil((float)x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n  return (float16)futrts_nextafter32((float)x, (float) y);\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n  return v0 + (v1 - v0) * t;\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n  return a * b + c;\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n  return a * b + c;\n}\n\n#else // Assume CUDA.\n\nstatic inline f16 futrts_log16(f16 x) {\n  return hlog(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n  return hlog2(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n  return hlog10(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n  return (f16)log1pf((float)x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n  ret",
                                       "urn hsqrt(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n  return cbrtf(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n  return hexp(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n  return hcos(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n  return hsin(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n  return tanf(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n  return acosf(x);\n}\n\nstatic inline f16 futrts_asin16(f16 x) {\n  return asinf(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n  return atanf(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n  return coshf(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n  return sinhf(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n  return tanhf(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n  return acoshf(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n  return asinhf(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n  return atanhf(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n  return atan2f(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n  return hypotf(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n  return tgammaf(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n  return lgammaf(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n  return erff(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n  return erfcf(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n  return fmodf(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n  return rintf(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n  return hfloor(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n  return hceil(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n  return __ushort_as_half(halfbitsnextafter(__half_as_ushort(x), __half_as_ushort(y)));\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n  return v0 + (v1 - v0) * t;\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n  return a * b + c;\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n  return fmaf(a, b, c);\n}\n\n#endif\n\n// The CUDA __half type cannot be put ", "in unions for some reason, so we\n// use bespoke conversion functions instead.\n#ifdef __CUDA_ARCH__\nstatic inline int16_t futrts_to_bits16(f16 x) {\n  return __half_as_ushort(x);\n}\nstatic inline f16 futrts_from_bits16(int16_t x) {\n  return __ushort_as_half(x);\n}\n#elif ISPC\n\nstatic inline int16_t futrts_to_bits16(f16 x) {\n  varying int16_t y = *((varying int16_t * uniform)&x);\n  return y;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n  varying f16 y = *((varying f16 * uniform)&x);\n  return y;\n}\n#else\nstatic inline int16_t futrts_to_bits16(f16 x) {\n  union {\n    f16 f;\n    int16_t t;\n  } p;\n\n  p.f = x;\n  return p.t;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n  union {\n    int16_t f;\n    f16 t;\n  } p;\n\n  p.f = x;\n  return p.t;\n}\n#endif\n\n#else // No native f16 - emulate.\n\nstatic inline f16 fabs16(f16 x) {\n  return fabs32(x);\n}\n\nstatic inline f16 fmax16(f16 x, f16 y) {\n  return fmax32(x, y);\n}\n\nstatic inline f16 fmin16(f16 x, f16 y) {\n  return fmin32(x, y);\n}\n\nstatic inline f16 fpow16(f16 x, f16 y) {\n  return fpow32(x, y);\n}\n\nstatic inline bool futrts_isnan16(f16 x) {\n  return futrts_isnan32(x);\n}\n\nstatic inline bool futrts_isinf16(f16 x) {\n  return futrts_isinf32(x);\n}\n\nstatic inline f16 futrts_log16(f16 x) {\n  return futrts_log32(x);\n}\n\nstatic inline f16 futrts_log2_16(f16 x) {\n  return futrts_log2_32(x);\n}\n\nstatic inline f16 futrts_log10_16(f16 x) {\n  return futrts_log10_32(x);\n}\n\nstatic inline f16 futrts_log1p_16(f16 x) {\n  return futrts_log1p_32(x);\n}\n\nstatic inline f16 futrts_sqrt16(f16 x) {\n  return futrts_sqrt32(x);\n}\n\nstatic inline f16 futrts_cbrt16(f16 x) {\n  return futrts_cbrt32(x);\n}\n\nstatic inline f16 futrts_exp16(f16 x) {\n  return futrts_exp32(x);\n}\n\nstatic inline f16 futrts_cos16(f16 x) {\n  return futrts_cos32(x);\n}\n\nstatic inline f16 futrts_sin16(f16 x) {\n  return futrts_sin32(x);\n}\n\nstatic inline f16 futrts_tan16(f16 x) {\n  return futrts_tan32(x);\n}\n\nstatic inline f16 futrts_acos16(f16 x) {\n  return futrts_acos32(x);\n}\n\nstatic inline f16 f", "utrts_asin16(f16 x) {\n  return futrts_asin32(x);\n}\n\nstatic inline f16 futrts_atan16(f16 x) {\n  return futrts_atan32(x);\n}\n\nstatic inline f16 futrts_cosh16(f16 x) {\n  return futrts_cosh32(x);\n}\n\nstatic inline f16 futrts_sinh16(f16 x) {\n  return futrts_sinh32(x);\n}\n\nstatic inline f16 futrts_tanh16(f16 x) {\n  return futrts_tanh32(x);\n}\n\nstatic inline f16 futrts_acosh16(f16 x) {\n  return futrts_acosh32(x);\n}\n\nstatic inline f16 futrts_asinh16(f16 x) {\n  return futrts_asinh32(x);\n}\n\nstatic inline f16 futrts_atanh16(f16 x) {\n  return futrts_atanh32(x);\n}\n\nstatic inline f16 futrts_atan2_16(f16 x, f16 y) {\n  return futrts_atan2_32(x, y);\n}\n\nstatic inline f16 futrts_hypot16(f16 x, f16 y) {\n  return futrts_hypot32(x, y);\n}\n\nstatic inline f16 futrts_gamma16(f16 x) {\n  return futrts_gamma32(x);\n}\n\nstatic inline f16 futrts_lgamma16(f16 x) {\n  return futrts_lgamma32(x);\n}\n\nstatic inline f16 futrts_erf16(f16 x) {\n  return futrts_erf32(x);\n}\n\nstatic inline f16 futrts_erfc16(f16 x) {\n  return futrts_erfc32(x);\n}\n\nstatic inline f16 fmod16(f16 x, f16 y) {\n  return fmod32(x, y);\n}\n\nstatic inline f16 futrts_round16(f16 x) {\n  return futrts_round32(x);\n}\n\nstatic inline f16 futrts_floor16(f16 x) {\n  return futrts_floor32(x);\n}\n\nstatic inline f16 futrts_ceil16(f16 x) {\n  return futrts_ceil32(x);\n}\n\nstatic inline f16 futrts_nextafter16(f16 x, f16 y) {\n  return halfbits2float(halfbitsnextafter(float2halfbits(x), float2halfbits(y)));\n}\n\nstatic inline f16 futrts_lerp16(f16 v0, f16 v1, f16 t) {\n  return futrts_lerp32(v0, v1, t);\n}\n\nstatic inline f16 futrts_mad16(f16 a, f16 b, f16 c) {\n  return futrts_mad32(a, b, c);\n}\n\nstatic inline f16 futrts_fma16(f16 a, f16 b, f16 c) {\n  return futrts_fma32(a, b, c);\n}\n\n// Even when we are using an OpenCL that does not support cl_khr_fp16,\n// it must still support vload_half for actually creating a\n// half-precision number, which can then be efficiently converted to a\n// float.  Similarly for vstore_half.\n#ifdef __OPENCL_VERSION__\n\nstatic inline int16_t futrt",
                                       "s_to_bits16(f16 x) {\n  int16_t y;\n  // Violating strict aliasing here.\n  vstore_half((float)x, 0, (half*)&y);\n  return y;\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n  return (f16)vload_half(0, (half*)&x);\n}\n\n#else\n\nstatic inline int16_t futrts_to_bits16(f16 x) {\n  return (int16_t)float2halfbits(x);\n}\n\nstatic inline f16 futrts_from_bits16(int16_t x) {\n  return halfbits2float((uint16_t)x);\n}\n\nstatic inline f16 fsignum16(f16 x) {\n  return futrts_isnan16(x) ? x : (x > 0 ? 1 : 0) - (x < 0 ? 1 : 0);\n}\n\n#endif\n\n#endif\n\nstatic inline float fpconv_f16_f16(f16 x) {\n  return x;\n}\n\nstatic inline float fpconv_f16_f32(f16 x) {\n  return x;\n}\n\nstatic inline f16 fpconv_f32_f16(float x) {\n  return (f16) x;\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\nstatic inline double fpconv_f16_f64(f16 x) {\n  return (double) x;\n}\n\n#if ISPC\nstatic inline f16 fpconv_f64_f16(double x) {\n  return (f16) ((float)x);\n}\n#else\nstatic inline f16 fpconv_f64_f16(double x) {\n  return (f16) x;\n}\n#endif\n#endif\n\n\n// End of scalar_f16.h.\n// Start of atomics.h\n\ninline int32_t atomic_xchg_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xchg_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_global(volatile __global int32_t *p,\n                                         int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((int32_t*)p, cmp, val);\n#else\n  return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_cmpxchg_i32_local(volatile __local int32_t *p,\n                                        int32_t cmp, int32_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((int32_t*)p, cmp, val);\n#else\n  return atomic_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int32_t atomic_add_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  re", "turn atomicAdd((int32_t*)p, x);\n#else\n  return atomic_add(p, x);\n#endif\n}\n\ninline int32_t atomic_add_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((int32_t*)p, x);\n#else\n  return atomic_add(p, x);\n#endif\n}\n\ninline float atomic_fadd_f32_global(volatile __global float *p, float x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((float*)p, x);\n#else\n  union { int32_t i; float f; } old;\n  union { int32_t i; float f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i32_global((volatile __global int32_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\ninline float atomic_fadd_f32_local(volatile __local float *p, float x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((float*)p, x);\n#else\n  union { int32_t i; float f; } old;\n  union { int32_t i; float f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i32_local((volatile __local int32_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\ninline int32_t atomic_smax_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smax_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_smin_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umax_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\n", "inline uint32_t atomic_umax_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint32_t*)p, x);\n#else\n  return atomic_max(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_global(volatile __global uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline uint32_t atomic_umin_i32_local(volatile __local uint32_t *p, uint32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint32_t*)p, x);\n#else\n  return atomic_min(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int32_t*)p, x);\n#else\n  return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_and_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int32_t*)p, x);\n#else\n  return atomic_and(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int32_t*)p, x);\n#else\n  return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_or_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int32_t*)p, x);\n#else\n  return atomic_or(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_global(volatile __global int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\ninline int32_t atomic_xor_i32_local(volatile __local int32_t *p, int32_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int32_t*)p, x);\n#else\n  return atomic_xor(p, x);\n#endif\n}\n\n// Start of 64 bit atomics\n\ninline int64_t atomic_xchg_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((uint64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xchg_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicExch((uint64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\ninli",
                                       "ne int64_t atomic_cmpxchg_i64_global(volatile __global int64_t *p,\n                                         int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((uint64_t*)p, cmp, val);\n#else\n  return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_cmpxchg_i64_local(volatile __local int64_t *p,\n                                        int64_t cmp, int64_t val) {\n#ifdef FUTHARK_CUDA\n  return atomicCAS((uint64_t*)p, cmp, val);\n#else\n  return atom_cmpxchg(p, cmp, val);\n#endif\n}\n\ninline int64_t atomic_add_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((uint64_t*)p, x);\n#else\n  return atom_add(p, x);\n#endif\n}\n\ninline int64_t atomic_add_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAdd((uint64_t*)p, x);\n#else\n  return atom_add(p, x);\n#endif\n}\n\n#ifdef FUTHARK_F64_ENABLED\n\ninline double atomic_fadd_f64_global(volatile __global double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n  return atomicAdd((double*)p, x);\n#else\n  union { int64_t i; double f; } old;\n  union { int64_t i; double f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i64_global((volatile __global int64_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\ninline double atomic_fadd_f64_local(volatile __local double *p, double x) {\n#if defined(FUTHARK_CUDA) && __CUDA_ARCH__ >= 600\n  return atomicAdd((double*)p, x);\n#else\n  union { int64_t i; double f; } old;\n  union { int64_t i; double f; } assumed;\n  old.f = *p;\n  do {\n    assumed.f = old.f;\n    old.f = old.f + x;\n    old.i = atomic_cmpxchg_i64_local((volatile __local int64_t*)p, assumed.i, old.i);\n  } while (assumed.i != old.i);\n  return old.f;\n#endif\n}\n\n#endif\n\ninline int64_t atomic_smax_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline ", "int64_t atomic_smax_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((int64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_smin_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((int64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umax_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMax((uint64_t*)p, x);\n#else\n  return atom_max(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_global(volatile __global uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline uint64_t atomic_umin_i64_local(volatile __local uint64_t *p, uint64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicMin((uint64_t*)p, x);\n#else\n  return atom_min(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int64_t*)p, x);\n#else\n  return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_and_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicAnd((int64_t*)p, x);\n#else\n  return atom_and(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_global(volatile __global int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int64_t*)p, x);\n#else\n  return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_or_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicOr((int64_t*)p, x);\n#else\n  return atom_or(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_global(volatile __global ", "int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\ninline int64_t atomic_xor_i64_local(volatile __local int64_t *p, int64_t x) {\n#ifdef FUTHARK_CUDA\n  return atomicXor((int64_t*)p, x);\n#else\n  return atom_xor(p, x);\n#endif\n}\n\n// End of atomics.h\n\n\n\n__attribute__((reqd_work_group_size(addzisegmap_group_sizze_6879, 1, 1)))\n__kernel void addzisegmap_6892(__global int *global_failure, int64_t n_6776, __global unsigned char *xs_mem_6916, __global unsigned char *ys_mem_6917, __global unsigned char *mem_6920)\n{\n    #define segmap_group_sizze_6888 (addzisegmap_group_sizze_6879)\n    \n    const int block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    \n    if (*global_failure >= 0)\n        return;\n    \n    int32_t local_tid_6925;\n    int64_t group_sizze_6928;\n    int32_t wave_sizze_6927;\n    int32_t group_tid_6926;\n    \n    local_tid_6925 = get_local_id(0);\n    group_sizze_6928 = get_local_size(0);\n    wave_sizze_6927 = LOCKSTEP_WIDTH;\n    group_tid_6926 = get_group_id(0);\n    \n    int32_t global_tid_6924 = group_tid_6926 * group_sizze_6928 + local_tid_6925;\n    int32_t phys_tid_6892 = global_tid_6924;\n    int64_t global_tid_6929 = sext_i32_i64(group_tid_6926) * segmap_group_sizze_6888 + sext_i32_i64(local_tid_6925);\n    int64_t slice_6930 = n_6776;\n    int64_t gtid_6891 = global_tid_6929;\n    int64_t remnant_6931 = global_tid_6929 - gtid_6891;\n    \n    if (slt64(gtid_6891, n_6776)) {\n        int8_t x_6893 = ((__global int8_t *) xs_mem_6916)[gtid_6891];\n        int8_t x_6894 = ((__global int8_t *) ys_mem_6917)[gtid_6891];\n        int8_t defunc_0_f_res_6895 = add8(x_6893, x_6894);\n        \n        ((__global int8_t *) mem_6920)[gtid_6891] = defunc_0_f_res_6895;\n    }\n    \n  error_0:\n    return;\n    #undef segmap_group_sizze_6888\n}\n__attribute__((reqd_work_group_size(add_i64zisegmap_group_sizze_6899, 1, 1)))\n__kernel void add_i64zisegmap_6912(__global int *global_failure, int64_t n_", "6836, __global unsigned char *xs_mem_6916, __global unsigned char *ys_mem_6917, __global unsigned char *mem_6921)\n{\n    #define segmap_group_sizze_6908 (add_i64zisegmap_group_sizze_6899)\n    \n    const int block_dim0 = 0;\n    const int block_dim1 = 1;\n    const int block_dim2 = 2;\n    \n    if (*global_failure >= 0)\n        return;\n    \n    int32_t local_tid_6925;\n    int64_t group_sizze_6928;\n    int32_t wave_sizze_6927;\n    int32_t group_tid_6926;\n    \n    local_tid_6925 = get_local_id(0);\n    group_sizze_6928 = get_local_size(0);\n    wave_sizze_6927 = LOCKSTEP_WIDTH;\n    group_tid_6926 = get_group_id(0);\n    \n    int32_t global_tid_6924 = group_tid_6926 * group_sizze_6928 + local_tid_6925;\n    int32_t phys_tid_6912 = global_tid_6924;\n    int64_t global_tid_6929 = sext_i32_i64(group_tid_6926) * segmap_group_sizze_6908 + sext_i32_i64(local_tid_6925);\n    int64_t slice_6930 = n_6836;\n    int64_t gtid_6911 = global_tid_6929;\n    int64_t remnant_6931 = global_tid_6929 - gtid_6911;\n    \n    if (slt64(gtid_6911, n_6836)) {\n        int64_t x_6913 = ((__global int64_t *) xs_mem_6916)[gtid_6911];\n        int64_t x_6914 = ((__global int64_t *) ys_mem_6917)[gtid_6911];\n        int64_t defunc_0_f_res_6915 = add64(x_6913, x_6914);\n        \n        ((__global int64_t *) mem_6921)[gtid_6911] = defunc_0_f_res_6915;\n    }\n    \n  error_0:\n    return;\n    #undef segmap_group_sizze_6908\n}\n", NULL};
// Start of backends/opencl.h

// Forward declarations.
struct opencl_device_option;
// Invoked by setup_opencl() after the platform and device has been
// found, but before the program is loaded.  Its intended use is to
// tune constants based on the selected platform and device.
static void post_opencl_setup(struct futhark_context*, struct opencl_device_option*);
static void set_tuning_params(struct futhark_context* ctx);
static char* get_failure_msg(int failure_idx, int64_t args[]);

#define OPENCL_SUCCEED_FATAL(e) opencl_succeed_fatal(e, #e, __FILE__, __LINE__)
#define OPENCL_SUCCEED_NONFATAL(e) opencl_succeed_nonfatal(e, #e, __FILE__, __LINE__)
// Take care not to override an existing error.
#define OPENCL_SUCCEED_OR_RETURN(e) {           \
    char *serror = OPENCL_SUCCEED_NONFATAL(e);  \
    if (serror) {                               \
      if (!ctx->error) {                        \
        ctx->error = serror;                    \
        return bad;                             \
      } else {                                  \
        free(serror);                           \
      }                                         \
    }                                           \
  }

// OPENCL_SUCCEED_OR_RETURN returns the value of the variable 'bad' in
// scope.  By default, it will be this one.  Create a local variable
// of some other type if needed.  This is a bit of a hack, but it
// saves effort in the code generator.
static const int bad = 1;

static const char* opencl_error_string(cl_int err) {
  switch (err) {
  case CL_SUCCESS:                            return "Success!";
  case CL_DEVICE_NOT_FOUND:                   return "Device not found.";
  case CL_DEVICE_NOT_AVAILABLE:               return "Device not available";
  case CL_COMPILER_NOT_AVAILABLE:             return "Compiler not available";
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:      return "Memory object allocation failure";
  case CL_OUT_OF_RESOURCES:                   return "Out of resources";
  case CL_OUT_OF_HOST_MEMORY:                 return "Out of host memory";
  case CL_PROFILING_INFO_NOT_AVAILABLE:       return "Profiling information not available";
  case CL_MEM_COPY_OVERLAP:                   return "Memory copy overlap";
  case CL_IMAGE_FORMAT_MISMATCH:              return "Image format mismatch";
  case CL_IMAGE_FORMAT_NOT_SUPPORTED:         return "Image format not supported";
  case CL_BUILD_PROGRAM_FAILURE:              return "Program build failure";
  case CL_MAP_FAILURE:                        return "Map failure";
  case CL_INVALID_VALUE:                      return "Invalid value";
  case CL_INVALID_DEVICE_TYPE:                return "Invalid device type";
  case CL_INVALID_PLATFORM:                   return "Invalid platform";
  case CL_INVALID_DEVICE:                     return "Invalid device";
  case CL_INVALID_CONTEXT:                    return "Invalid context";
  case CL_INVALID_QUEUE_PROPERTIES:           return "Invalid queue properties";
  case CL_INVALID_COMMAND_QUEUE:              return "Invalid command queue";
  case CL_INVALID_HOST_PTR:                   return "Invalid host pointer";
  case CL_INVALID_MEM_OBJECT:                 return "Invalid memory object";
  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:    return "Invalid image format descriptor";
  case CL_INVALID_IMAGE_SIZE:                 return "Invalid image size";
  case CL_INVALID_SAMPLER:                    return "Invalid sampler";
  case CL_INVALID_BINARY:                     return "Invalid binary";
  case CL_INVALID_BUILD_OPTIONS:              return "Invalid build options";
  case CL_INVALID_PROGRAM:                    return "Invalid program";
  case CL_INVALID_PROGRAM_EXECUTABLE:         return "Invalid program executable";
  case CL_INVALID_KERNEL_NAME:                return "Invalid kernel name";
  case CL_INVALID_KERNEL_DEFINITION:          return "Invalid kernel definition";
  case CL_INVALID_KERNEL:                     return "Invalid kernel";
  case CL_INVALID_ARG_INDEX:                  return "Invalid argument index";
  case CL_INVALID_ARG_VALUE:                  return "Invalid argument value";
  case CL_INVALID_ARG_SIZE:                   return "Invalid argument size";
  case CL_INVALID_KERNEL_ARGS:                return "Invalid kernel arguments";
  case CL_INVALID_WORK_DIMENSION:             return "Invalid work dimension";
  case CL_INVALID_WORK_GROUP_SIZE:            return "Invalid work group size";
  case CL_INVALID_WORK_ITEM_SIZE:             return "Invalid work item size";
  case CL_INVALID_GLOBAL_OFFSET:              return "Invalid global offset";
  case CL_INVALID_EVENT_WAIT_LIST:            return "Invalid event wait list";
  case CL_INVALID_EVENT:                      return "Invalid event";
  case CL_INVALID_OPERATION:                  return "Invalid operation";
  case CL_INVALID_GL_OBJECT:                  return "Invalid OpenGL object";
  case CL_INVALID_BUFFER_SIZE:                return "Invalid buffer size";
  case CL_INVALID_MIP_LEVEL:                  return "Invalid mip-map level";
  default:                                    return "Unknown";
  }
}

static void opencl_succeed_fatal(cl_int ret,
                                 const char *call,
                                 const char *file,
                                 int line) {
  if (ret != CL_SUCCESS) {
    futhark_panic(-1, "%s:%d: OpenCL call\n  %s\nfailed with error code %d (%s)\n",
                  file, line, call, ret, opencl_error_string(ret));
  }
}

static char* opencl_succeed_nonfatal(cl_int ret,
                                     const char *call,
                                     const char *file,
                                     int line) {
  if (ret != CL_SUCCESS) {
    return msgprintf("%s:%d: OpenCL call\n  %s\nfailed with error code %d (%s)\n",
                     file, line, call, ret, opencl_error_string(ret));
  } else {
    return NULL;
  }
}

struct futhark_context_config {
  int in_use;
  int debugging;
  int profiling;
  int logging;
  const char *cache_fname;
  int num_tuning_params;
  int64_t *tuning_params;
  const char** tuning_param_names;
  const char** tuning_param_vars;
  const char** tuning_param_classes;
  // Uniform fields above.

  int preferred_device_num;
  const char *preferred_platform;
  const char *preferred_device;
  int ignore_blacklist;

  const char* dump_program_to;
  const char* load_program_from;
  const char* dump_binary_to;
  const char* load_binary_from;

  size_t default_group_size;
  size_t default_num_groups;
  size_t default_tile_size;
  size_t default_reg_tile_size;
  size_t default_threshold;

  int default_group_size_changed;
  int default_tile_size_changed;
  int num_build_opts;
  const char **build_opts;

  cl_command_queue queue;
  int queue_set;
};

static void backend_context_config_setup(struct futhark_context_config* cfg) {
  cfg->num_build_opts = 0;
  cfg->build_opts = (const char**) malloc(sizeof(const char*));
  cfg->build_opts[0] = NULL;
  cfg->preferred_device_num = 0;
  cfg->preferred_platform = "";
  cfg->preferred_device = "";
  cfg->ignore_blacklist = 0;
  cfg->dump_program_to = NULL;
  cfg->load_program_from = NULL;
  cfg->dump_binary_to = NULL;
  cfg->load_binary_from = NULL;

  // The following are dummy sizes that mean the concrete defaults
  // will be set during initialisation via hardware-inspection-based
  // heuristics.
  cfg->default_group_size = 0;
  cfg->default_num_groups = 0;
  cfg->default_tile_size = 0;
  cfg->default_reg_tile_size = 0;
  cfg->default_threshold = 0;

  cfg->default_group_size_changed = 0;
  cfg->default_tile_size_changed = 0;

  cfg->queue_set = 0;
}

static void backend_context_config_teardown(struct futhark_context_config* cfg) {
  free(cfg->build_opts);
}

void futhark_context_config_add_build_option(struct futhark_context_config* cfg, const char *opt) {
  cfg->build_opts[cfg->num_build_opts] = opt;
  cfg->num_build_opts++;
  cfg->build_opts = (const char**) realloc(cfg->build_opts, (cfg->num_build_opts+1) * sizeof(const char*));
  cfg->build_opts[cfg->num_build_opts] = NULL;
}

void futhark_context_config_set_device(struct futhark_context_config *cfg, const char* s) {
  int x = 0;
  if (*s == '#') {
    s++;
    while (isdigit(*s)) {
      x = x * 10 + (*s++)-'0';
    }
    // Skip trailing spaces.
    while (isspace(*s)) {
      s++;
    }
  }
  cfg->preferred_device = s;
  cfg->preferred_device_num = x;
  cfg->ignore_blacklist = 1;
}

void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s) {
  cfg->preferred_platform = s;
  cfg->ignore_blacklist = 1;
}

void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue q) {
  cfg->queue = q;
  cfg->queue_set = 1;
}

struct opencl_device_option {
  cl_platform_id platform;
  cl_device_id device;
  cl_device_type device_type;
  char *platform_name;
  char *device_name;
};

static char* opencl_platform_info(cl_platform_id platform,
                                  cl_platform_info param) {
  size_t req_bytes;
  char *info;

  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, 0, NULL, &req_bytes));

  info = (char*) malloc(req_bytes);

  OPENCL_SUCCEED_FATAL(clGetPlatformInfo(platform, param, req_bytes, info, NULL));

  return info;
}

static char* opencl_device_info(cl_device_id device,
                                cl_device_info param) {
  size_t req_bytes;
  char *info;

  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, 0, NULL, &req_bytes));

  info = (char*) malloc(req_bytes);

  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device, param, req_bytes, info, NULL));

  return info;
}

static int is_blacklisted(const char *platform_name, const char *device_name,
                          const struct futhark_context_config *cfg) {
  if (strcmp(cfg->preferred_platform, "") != 0 ||
      strcmp(cfg->preferred_device, "") != 0) {
    return 0;
  } else if (strstr(platform_name, "Apple") != NULL &&
             strstr(device_name, "Intel(R) Core(TM)") != NULL) {
    return 1;
  } else {
    return 0;
  }
}

static void opencl_all_device_options(struct opencl_device_option **devices_out,
                                      size_t *num_devices_out) {
  size_t num_devices = 0, num_devices_added = 0;

  cl_platform_id *all_platforms;
  cl_uint *platform_num_devices;

  cl_uint num_platforms;

  // Find the number of platforms.
  OPENCL_SUCCEED_FATAL(clGetPlatformIDs(0, NULL, &num_platforms));

  // Make room for them.
  all_platforms = calloc(num_platforms, sizeof(cl_platform_id));
  platform_num_devices = calloc(num_platforms, sizeof(cl_uint));

  // Fetch all the platforms.
  OPENCL_SUCCEED_FATAL(clGetPlatformIDs(num_platforms, all_platforms, NULL));

  // Count the number of devices for each platform, as well as the
  // total number of devices.
  for (cl_uint i = 0; i < num_platforms; i++) {
    if (clGetDeviceIDs(all_platforms[i], CL_DEVICE_TYPE_ALL,
                       0, NULL, &platform_num_devices[i]) == CL_SUCCESS) {
      num_devices += platform_num_devices[i];
    } else {
      platform_num_devices[i] = 0;
    }
  }

  // Make room for all the device options.
  struct opencl_device_option *devices =
    calloc(num_devices, sizeof(struct opencl_device_option));

  // Loop through the platforms, getting information about their devices.
  for (cl_uint i = 0; i < num_platforms; i++) {
    cl_platform_id platform = all_platforms[i];
    cl_uint num_platform_devices = platform_num_devices[i];

    if (num_platform_devices == 0) {
      continue;
    }

    char *platform_name = opencl_platform_info(platform, CL_PLATFORM_NAME);
    cl_device_id *platform_devices =
      calloc(num_platform_devices, sizeof(cl_device_id));

    // Fetch all the devices.
    OPENCL_SUCCEED_FATAL(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL,
                                        num_platform_devices, platform_devices, NULL));

    // Loop through the devices, adding them to the devices array.
    for (cl_uint i = 0; i < num_platform_devices; i++) {
      char *device_name = opencl_device_info(platform_devices[i], CL_DEVICE_NAME);
      devices[num_devices_added].platform = platform;
      devices[num_devices_added].device = platform_devices[i];
      OPENCL_SUCCEED_FATAL(clGetDeviceInfo(platform_devices[i], CL_DEVICE_TYPE,
                                           sizeof(cl_device_type),
                                           &devices[num_devices_added].device_type,
                                           NULL));
      // We don't want the structs to share memory, so copy the platform name.
      // Each device name is already unique.
      devices[num_devices_added].platform_name = strclone(platform_name);
      devices[num_devices_added].device_name = device_name;
      num_devices_added++;
    }
    free(platform_devices);
    free(platform_name);
  }
  free(all_platforms);
  free(platform_num_devices);

  *devices_out = devices;
  *num_devices_out = num_devices;
}

void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg) {
  struct opencl_device_option *devices;
  size_t num_devices;

  opencl_all_device_options(&devices, &num_devices);

  printf("Choose OpenCL device:\n");
  const char *cur_platform = "";
  for (size_t i = 0; i < num_devices; i++) {
    struct opencl_device_option device = devices[i];
    if (strcmp(cur_platform, device.platform_name) != 0) {
      printf("Platform: %s\n", device.platform_name);
      cur_platform = device.platform_name;
    }
    printf("[%d] %s\n", (int)i, device.device_name);
  }

  int selection;
  printf("Choice: ");
  if (scanf("%d", &selection) == 1) {
    cfg->preferred_platform = "";
    cfg->preferred_device = "";
    cfg->preferred_device_num = selection;
    cfg->ignore_blacklist = 1;
  }

  // Free all the platform and device names.
  for (size_t j = 0; j < num_devices; j++) {
    free(devices[j].platform_name);
    free(devices[j].device_name);
  }
  free(devices);
}

void futhark_context_config_list_devices(struct futhark_context_config *cfg) {
  (void)cfg;
  struct opencl_device_option *devices;
  size_t num_devices;

  opencl_all_device_options(&devices, &num_devices);

  const char *cur_platform = "";
  for (size_t i = 0; i < num_devices; i++) {
    struct opencl_device_option device = devices[i];
    if (strcmp(cur_platform, device.platform_name) != 0) {
      printf("Platform: %s\n", device.platform_name);
      cur_platform = device.platform_name;
    }
    printf("[%d]: %s\n", (int)i, device.device_name);
  }

  // Free all the platform and device names.
  for (size_t j = 0; j < num_devices; j++) {
    free(devices[j].platform_name);
    free(devices[j].device_name);
  }
  free(devices);
}

void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *path) {
  cfg->dump_program_to = path;
}

void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *path) {
  cfg->load_program_from = path;
}

void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *path) {
  cfg->dump_binary_to = path;
}

void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *path) {
  cfg->load_binary_from = path;
}

void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size) {
  cfg->default_group_size = size;
  cfg->default_group_size_changed = 1;
}

void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int num) {
  cfg->default_num_groups = num;
}

void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size) {
  cfg->default_tile_size = size;
  cfg->default_tile_size_changed = 1;
}

void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size) {
  cfg->default_reg_tile_size = size;
}

void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size) {
  cfg->default_threshold = size;
}

int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg,
                                            const char *param_name,
                                            size_t new_value) {
  for (int i = 0; i < cfg->num_tuning_params; i++) {
    if (strcmp(param_name, cfg->tuning_param_names[i]) == 0) {
      cfg->tuning_params[i] = new_value;
      return 0;
    }
  }
  if (strcmp(param_name, "default_group_size") == 0) {
    cfg->default_group_size = new_value;
    return 0;
  }
  if (strcmp(param_name, "default_num_groups") == 0) {
    cfg->default_num_groups = new_value;
    return 0;
  }
  if (strcmp(param_name, "default_threshold") == 0) {
    cfg->default_threshold = new_value;
    return 0;
  }
  if (strcmp(param_name, "default_tile_size") == 0) {
    cfg->default_tile_size = new_value;
    return 0;
  }
  if (strcmp(param_name, "default_reg_tile_size") == 0) {
    cfg->default_reg_tile_size = new_value;
    return 0;
  }
  return 1;
}

// A record of something that happened.
struct profiling_record {
  cl_event *event;
  int *runs;
  int64_t *runtime;
};

struct futhark_context {
  struct futhark_context_config* cfg;
  int detail_memory;
  int debugging;
  int profiling;
  int profiling_paused;
  int logging;
  lock_t lock;
  char *error;
  lock_t error_lock;
  FILE *log;
  struct constants *constants;
  struct free_list free_list;
  int64_t peak_mem_usage_default;
  int64_t cur_mem_usage_default;
  struct program* program;

  // Common fields above.

  cl_mem global_failure;
  cl_mem global_failure_args;
  struct tuning_params tuning_params;
  // True if a potentially failing kernel has been enqueued.
  cl_int failure_is_an_option;
  int total_runs;
  long int total_runtime;
  int64_t peak_mem_usage_device;
  int64_t cur_mem_usage_device;

  cl_device_id device;
  cl_context ctx;
  cl_command_queue queue;
  cl_program clprogram;

  struct free_list cl_free_list;

  size_t max_group_size;
  size_t max_num_groups;
  size_t max_tile_size;
  size_t max_threshold;
  size_t max_local_memory;

  size_t lockstep_width;

  struct profiling_record *profiling_records;
  int profiling_records_capacity;
  int profiling_records_used;

};

static cl_build_status build_opencl_program(cl_program program, cl_device_id device, const char* options) {
  cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);

  // Avoid termination due to CL_BUILD_PROGRAM_FAILURE
  if (clBuildProgram_error != CL_SUCCESS &&
      clBuildProgram_error != CL_BUILD_PROGRAM_FAILURE) {
    OPENCL_SUCCEED_FATAL(clBuildProgram_error);
  }

  cl_build_status build_status;
  OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program,
                                             device,
                                             CL_PROGRAM_BUILD_STATUS,
                                             sizeof(cl_build_status),
                                             &build_status,
                                             NULL));

  if (build_status != CL_SUCCESS) {
    char *build_log;
    size_t ret_val_size;
    OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));

    build_log = (char*) malloc(ret_val_size+1);
    OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));

    // The spec technically does not say whether the build log is zero-terminated, so let's be careful.
    build_log[ret_val_size] = '\0';

    fprintf(stderr, "Build log:\n%s\n", build_log);

    free(build_log);
  }

  return build_status;
}

static char* mk_compile_opts(struct futhark_context *ctx,
                             const char *extra_build_opts[],
                             struct opencl_device_option device_option) {
  int compile_opts_size = 1024;

  for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
    compile_opts_size += strlen(ctx->cfg->tuning_param_names[i]) + 20;
  }

  for (int i = 0; extra_build_opts[i] != NULL; i++) {
    compile_opts_size += strlen(extra_build_opts[i] + 1);
  }

  char *compile_opts = (char*) malloc(compile_opts_size);

  int w = snprintf(compile_opts, compile_opts_size,
                   "-DLOCKSTEP_WIDTH=%d ",
                   (int)ctx->lockstep_width);

  w += snprintf(compile_opts+w, compile_opts_size-w,
                "-D%s=%d ",
                "max_group_size",
                (int)ctx->max_group_size);

  for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
    w += snprintf(compile_opts+w, compile_opts_size-w,
                  "-D%s=%d ",
                  ctx->cfg->tuning_param_vars[i],
                  (int)ctx->cfg->tuning_params[i]);
  }

  for (int i = 0; extra_build_opts[i] != NULL; i++) {
    w += snprintf(compile_opts+w, compile_opts_size-w,
                  "%s ", extra_build_opts[i]);
  }

  // Oclgrind claims to support cl_khr_fp16, but this is not actually
  // the case.
  if (strcmp(device_option.platform_name, "Oclgrind") == 0) {
    w += snprintf(compile_opts+w, compile_opts_size-w, "-DEMULATE_F16 ");
  }

  return compile_opts;
}

// Count up the runtime all the profiling_records that occured during execution.
// Also clears the buffer of profiling_records.
static cl_int opencl_tally_profiling_records(struct futhark_context *ctx) {
  cl_int err;
  for (int i = 0; i < ctx->profiling_records_used; i++) {
    struct profiling_record record = ctx->profiling_records[i];

    cl_ulong start_t, end_t;

    if ((err = clGetEventProfilingInfo(*record.event,
                                       CL_PROFILING_COMMAND_START,
                                       sizeof(start_t),
                                       &start_t,
                                       NULL)) != CL_SUCCESS) {
      return err;
    }

    if ((err = clGetEventProfilingInfo(*record.event,
                                       CL_PROFILING_COMMAND_END,
                                       sizeof(end_t),
                                       &end_t,
                                       NULL)) != CL_SUCCESS) {
      return err;
    }

    // OpenCL provides nanosecond resolution, but we want
    // microseconds.
    *record.runs += 1;
    *record.runtime += (end_t - start_t)/1000;

    if ((err = clReleaseEvent(*record.event)) != CL_SUCCESS) {
      return err;
    }
    free(record.event);
  }

  ctx->profiling_records_used = 0;

  return CL_SUCCESS;
}

// If profiling, produce an event associated with a profiling record.
static cl_event* opencl_get_event(struct futhark_context *ctx, int *runs, int64_t *runtime) {
  if (ctx->profiling_records_used == ctx->profiling_records_capacity) {
    ctx->profiling_records_capacity *= 2;
    ctx->profiling_records =
      realloc(ctx->profiling_records,
              ctx->profiling_records_capacity *
              sizeof(struct profiling_record));
  }
  cl_event *event = malloc(sizeof(cl_event));
  ctx->profiling_records[ctx->profiling_records_used].event = event;
  ctx->profiling_records[ctx->profiling_records_used].runs = runs;
  ctx->profiling_records[ctx->profiling_records_used].runtime = runtime;
  ctx->profiling_records_used++;
  return event;
}

// Allocate memory from driver. The problem is that OpenCL may perform
// lazy allocation, so we cannot know whether an allocation succeeded
// until the first time we try to use it.  Hence we immediately
// perform a write to see if the allocation succeeded.  This is slow,
// but the assumption is that this operation will be rare (most things
// will go through the free list).
static int opencl_alloc_actual(struct futhark_context *ctx, size_t size, cl_mem *mem_out) {
  int error;
  *mem_out = clCreateBuffer(ctx->ctx, CL_MEM_READ_WRITE, size, NULL, &error);

  if (error != CL_SUCCESS) {
    return error;
  }

  int x = 2;
  error = clEnqueueWriteBuffer(ctx->queue, *mem_out,
                               CL_TRUE,
                               0, sizeof(x), &x,
                               0, NULL, NULL);

  // No need to wait for completion here. clWaitForEvents() cannot
  // return mem object allocation failures. This implies that the
  // buffer is faulted onto the device on enqueue. (Observation by
  // Andreas Kloeckner.)

  return error;
}

static int opencl_alloc(struct futhark_context *ctx, FILE *log,
                        size_t min_size, const char *tag,
                        cl_mem *mem_out, size_t *size_out) {
  (void)tag;
  if (min_size < sizeof(int)) {
    min_size = sizeof(int);
  }

  cl_mem* memptr;
  if (free_list_find(&ctx->cl_free_list, min_size, tag, size_out, (fl_mem*)&memptr) == 0) {
    // Successfully found a free block.  Is it big enough?
    if (*size_out >= min_size) {
      if (ctx->cfg->debugging) {
        fprintf(log, "No need to allocate: Found a block in the free list.\n");
      }
      *mem_out = *memptr;
      free(memptr);
      return CL_SUCCESS;
    } else {
      if (ctx->cfg->debugging) {
        fprintf(log, "Found a free block, but it was too small.\n");
      }
      int error = clReleaseMemObject(*memptr);
      free(*memptr);
      if (error != CL_SUCCESS) {
        return error;
      }
    }
  }

  *size_out = min_size;

  // We have to allocate a new block from the driver.  If the
  // allocation does not succeed, then we might be in an out-of-memory
  // situation.  We now start freeing things from the free list until
  // we think we have freed enough that the allocation will succeed.
  // Since we don't know how far the allocation is from fitting, we
  // have to check after every deallocation.  This might be pretty
  // expensive.  Let's hope that this case is hit rarely.

  if (ctx->cfg->debugging) {
    fprintf(log, "Actually allocating the desired block.\n");
  }

  int error = opencl_alloc_actual(ctx, min_size, mem_out);

  while (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) {
    if (ctx->cfg->debugging) {
      fprintf(log, "Out of OpenCL memory: releasing entry from the free list...\n");
    }
    cl_mem* memptr;
    if (free_list_first(&ctx->cl_free_list, (fl_mem*)&memptr) == 0) {
      cl_mem mem = *memptr;
      free(memptr);
      error = clReleaseMemObject(mem);
      if (error != CL_SUCCESS) {
        return error;
      }
    } else {
      break;
    }
    error = opencl_alloc_actual(ctx, min_size, mem_out);
  }

  return error;
}

static int opencl_free(struct futhark_context *ctx,
                       cl_mem mem, size_t size, const char *tag) {
  cl_mem* memptr = malloc(sizeof(cl_mem));
  *memptr = mem;
  free_list_insert(&ctx->cl_free_list, size, (fl_mem)memptr, tag);
  return CL_SUCCESS;
}

static int opencl_free_all(struct futhark_context *ctx) {
  free_list_pack(&ctx->cl_free_list);
  cl_mem* memptr;
  while (free_list_first(&ctx->cl_free_list, (fl_mem*)&memptr) == 0) {
    cl_mem mem = *memptr;
    free(memptr);
    int error = clReleaseMemObject(mem);
    if (error != CL_SUCCESS) {
      return error;
    }
  }

  return CL_SUCCESS;
}

int futhark_context_sync(struct futhark_context* ctx) {
  // Check for any delayed error.
  cl_int failure_idx = -1;
  if (ctx->failure_is_an_option) {
    OPENCL_SUCCEED_OR_RETURN(
                             clEnqueueReadBuffer(ctx->queue,
                                                 ctx->global_failure,
                                                 CL_FALSE,
                                                 0, sizeof(cl_int), &failure_idx,
                                                 0, NULL, NULL));
    ctx->failure_is_an_option = 0;
  }

  OPENCL_SUCCEED_OR_RETURN(clFinish(ctx->queue));

  if (failure_idx >= 0) {
    // We have to clear global_failure so that the next entry point
    // is not considered a failure from the start.
    cl_int no_failure = -1;
    OPENCL_SUCCEED_OR_RETURN(
                             clEnqueueWriteBuffer(ctx->queue, ctx->global_failure, CL_TRUE,
                                                  0, sizeof(cl_int), &no_failure,
                                                  0, NULL, NULL));

    int64_t args[max_failure_args+1];
    OPENCL_SUCCEED_OR_RETURN(
                             clEnqueueReadBuffer(ctx->queue,
                                                 ctx->global_failure_args,
                                                 CL_TRUE,
                                                 0, sizeof(args), &args,
                                                 0, NULL, NULL));

    ctx->error = get_failure_msg(failure_idx, args);

    return FUTHARK_PROGRAM_ERROR;
  }
  return 0;
}


// We take as input several strings representing the program, because
// C does not guarantee that the compiler supports particularly large
// literals.  Notably, Visual C has a limit of 2048 characters.  The
// array must be NULL-terminated.
static void setup_opencl_with_command_queue(struct futhark_context *ctx,
                                            cl_command_queue queue,
                                            const char *srcs[],
                                            const char *extra_build_opts[],
                                            const char* cache_fname) {
  int error;

  free_list_init(&ctx->cl_free_list);
  ctx->queue = queue;

  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx->ctx, NULL));

  // Fill out the device info.  This is redundant work if we are
  // called from setup_opencl() (which is the common case), but I
  // doubt it matters much.
  struct opencl_device_option device_option;
  OPENCL_SUCCEED_FATAL(clGetCommandQueueInfo(ctx->queue, CL_QUEUE_DEVICE,
                                             sizeof(cl_device_id),
                                             &device_option.device,
                                             NULL));
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PLATFORM,
                                       sizeof(cl_platform_id),
                                       &device_option.platform,
                                       NULL));
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_TYPE,
                                       sizeof(cl_device_type),
                                       &device_option.device_type,
                                       NULL));
  device_option.platform_name = opencl_platform_info(device_option.platform, CL_PLATFORM_NAME);
  device_option.device_name = opencl_device_info(device_option.device, CL_DEVICE_NAME);

  ctx->device = device_option.device;

  if (f64_required) {
    cl_uint supported;
    OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                                         sizeof(cl_uint), &supported, NULL));
    if (!supported) {
      futhark_panic(1, "Program uses double-precision floats, but this is not supported on the chosen device: %s\n",
                    device_option.device_name);
    }
  }

  size_t max_group_size;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                       sizeof(size_t), &max_group_size, NULL));

  size_t max_tile_size = sqrt(max_group_size);

  cl_ulong max_local_memory;
  OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_LOCAL_MEM_SIZE,
                                       sizeof(size_t), &max_local_memory, NULL));

  // Futhark reserves 4 bytes for bookkeeping information.
  max_local_memory -= 4;

  // The OpenCL implementation may reserve some local memory bytes for
  // various purposes.  In principle, we should use
  // clGetKernelWorkGroupInfo() to figure out for each kernel how much
  // is actually available, but our current code generator design
  // makes this infeasible.  Instead, we have this nasty hack where we
  // arbitrarily subtract some bytes, based on empirical measurements
  // (but which might be arbitrarily wrong).  Fortunately, we rarely
  // try to really push the local memory usage.
  if (strstr(device_option.platform_name, "NVIDIA CUDA") != NULL) {
    max_local_memory -= 12;
  } else if (strstr(device_option.platform_name, "AMD") != NULL) {
    max_local_memory -= 16;
  }

  // Make sure this function is defined.
  post_opencl_setup(ctx, &device_option);

  if (max_group_size < ctx->cfg->default_group_size) {
    if (ctx->cfg->default_group_size_changed) {
      fprintf(stderr, "Note: Device limits default group size to %zu (down from %zu).\n",
              max_group_size, ctx->cfg->default_group_size);
    }
    ctx->cfg->default_group_size = max_group_size;
  }

  if (max_tile_size < ctx->cfg->default_tile_size) {
    if (ctx->cfg->default_tile_size_changed) {
      fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n",
              max_tile_size, ctx->cfg->default_tile_size);
    }
    ctx->cfg->default_tile_size = max_tile_size;
  }

  ctx->max_group_size = max_group_size;
  ctx->max_tile_size = max_tile_size; // No limit.
  ctx->max_threshold = ctx->max_num_groups = 0; // No limit.
  ctx->max_local_memory = max_local_memory;

  // Now we go through all the sizes, clamp them to the valid range,
  // or set them to the default.
  for (int i = 0; i < ctx->cfg->num_tuning_params; i++) {
    const char *size_class = ctx->cfg->tuning_param_classes[i];
    int64_t *size_value = &ctx->cfg->tuning_params[i];
    const char* size_name = ctx->cfg->tuning_param_names[i];
    int64_t max_value = 0, default_value = 0;

    if (strstr(size_class, "group_size") == size_class) {
      max_value = max_group_size;
      default_value = ctx->cfg->default_group_size;
    } else if (strstr(size_class, "num_groups") == size_class) {
      max_value = max_group_size; // Futhark assumes this constraint.
      default_value = ctx->cfg->default_num_groups;
      // XXX: as a quick and dirty hack, use twice as many threads for
      // histograms by default.  We really should just be smarter
      // about sizes somehow.
      if (strstr(size_name, ".seghist_") != NULL) {
        default_value *= 2;
      }
    } else if (strstr(size_class, "tile_size") == size_class) {
      max_value = sqrt(max_group_size);
      default_value = ctx->cfg->default_tile_size;
    } else if (strstr(size_class, "reg_tile_size") == size_class) {
      max_value = 0; // No limit.
      default_value = ctx->cfg->default_reg_tile_size;
    } else if (strstr(size_class, "threshold") == size_class) {
      // Threshold can be as large as it takes.
      default_value = ctx->cfg->default_threshold;
    } else {
      // Bespoke sizes have no limit or default.
    }
    if (*size_value == 0) {
      *size_value = default_value;
    } else if (max_value > 0 && *size_value > max_value) {
      fprintf(stderr, "Note: Device limits %s to %d (down from %d)\n",
              size_name, (int)max_value, (int)*size_value);
      *size_value = max_value;
    }
  }

  if (ctx->lockstep_width == 0) {
    ctx->lockstep_width = 1;
  }

  if (ctx->cfg->logging) {
    fprintf(stderr, "Lockstep width: %d\n", (int)ctx->lockstep_width);
    fprintf(stderr, "Default group size: %d\n", (int)ctx->cfg->default_group_size);
    fprintf(stderr, "Default number of groups: %d\n", (int)ctx->cfg->default_num_groups);
  }

  char *compile_opts = mk_compile_opts(ctx, extra_build_opts, device_option);

  if (ctx->cfg->logging) {
    fprintf(stderr, "OpenCL compiler options: %s\n", compile_opts);
  }

  char *fut_opencl_src = NULL;
  cl_program prog;
  error = CL_SUCCESS;

  struct cache_hash h;

  int loaded_from_cache = 0;
  if (ctx->cfg->load_binary_from == NULL) {
    size_t src_size = 0;

    // Maybe we have to read OpenCL source from somewhere else (used for debugging).
    if (ctx->cfg->load_program_from != NULL) {
      fut_opencl_src = slurp_file(ctx->cfg->load_program_from, NULL);
      assert(fut_opencl_src != NULL);
    } else {
      // Construct the OpenCL source concatenating all the fragments.
      for (const char **src = srcs; src && *src; src++) {
        src_size += strlen(*src);
      }

      fut_opencl_src = (char*) malloc(src_size + 1);

      size_t n, i;
      for (i = 0, n = 0; srcs && srcs[i]; i++) {
        strncpy(fut_opencl_src+n, srcs[i], src_size-n);
        n += strlen(srcs[i]);
      }
      fut_opencl_src[src_size] = 0;
    }

    if (ctx->cfg->dump_program_to != NULL) {
      if (ctx->cfg->logging) {
        fprintf(stderr, "Dumping OpenCL source to %s...\n", ctx->cfg->dump_program_to);
      }

      dump_file(ctx->cfg->dump_program_to, fut_opencl_src, strlen(fut_opencl_src));
    }

    if (cache_fname != NULL) {
      if (ctx->cfg->logging) {
        fprintf(stderr, "Restoring cache from from %s...\n", cache_fname);
      }
      cache_hash_init(&h);
      cache_hash(&h, fut_opencl_src, strlen(fut_opencl_src));
      cache_hash(&h, compile_opts, strlen(compile_opts));

      unsigned char *buf;
      size_t bufsize;
      errno = 0;
      if (cache_restore(cache_fname, &h, &buf, &bufsize) != 0) {
        if (ctx->cfg->logging) {
          fprintf(stderr, "Failed to restore cache (errno: %s)\n", strerror(errno));
        }
      } else {
        if (ctx->cfg->logging) {
          fprintf(stderr, "Cache restored; loading OpenCL binary...\n");
        }

        cl_int status = 0;
        prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device,
                                         &bufsize, (const unsigned char**)&buf,
                                         &status, &error);
        if (status == CL_SUCCESS) {
          loaded_from_cache = 1;
          if (ctx->cfg->logging) {
            fprintf(stderr, "Loading succeeded.\n");
          }
        } else {
          if (ctx->cfg->logging) {
            fprintf(stderr, "Loading failed.\n");
          }
        }
      }
    }

    if (!loaded_from_cache) {
      if (ctx->cfg->logging) {
        fprintf(stderr, "Creating OpenCL program...\n");
      }

      const char* src_ptr[] = {fut_opencl_src};
      prog = clCreateProgramWithSource(ctx->ctx, 1, src_ptr, &src_size, &error);
      OPENCL_SUCCEED_FATAL(error);
    }
  } else {
    if (ctx->cfg->logging) {
      fprintf(stderr, "Loading OpenCL binary from %s...\n", ctx->cfg->load_binary_from);
    }
    size_t binary_size;
    unsigned char *fut_opencl_bin =
      (unsigned char*) slurp_file(ctx->cfg->load_binary_from, &binary_size);
    assert(fut_opencl_bin != NULL);
    const unsigned char *binaries[1] = { fut_opencl_bin };
    cl_int status = 0;

    prog = clCreateProgramWithBinary(ctx->ctx, 1, &device_option.device,
                                     &binary_size, binaries,
                                     &status, &error);

    OPENCL_SUCCEED_FATAL(status);
    OPENCL_SUCCEED_FATAL(error);
  }

  if (ctx->cfg->logging) {
    fprintf(stderr, "Building OpenCL program...\n");
  }
  OPENCL_SUCCEED_FATAL(build_opencl_program(prog, device_option.device, compile_opts));

  free(compile_opts);
  free(fut_opencl_src);

  size_t binary_size = 0;
  unsigned char *binary = NULL;
  int store_in_cache = cache_fname != NULL && !loaded_from_cache;
  if (store_in_cache || ctx->cfg->dump_binary_to != NULL) {
    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES,
                                          sizeof(size_t), &binary_size, NULL));
    binary = (unsigned char*) malloc(binary_size);
    OPENCL_SUCCEED_FATAL(clGetProgramInfo(prog, CL_PROGRAM_BINARIES,
                                          sizeof(unsigned char*), &binary, NULL));
  }

  if (store_in_cache) {
    if (ctx->cfg->logging) {
      fprintf(stderr, "Caching OpenCL binary in %s...\n", cache_fname);
    }
    if (cache_store(cache_fname, &h, binary, binary_size) != 0) {
      printf("Failed to cache binary: %s\n", strerror(errno));
    }
  }

  if (ctx->cfg->dump_binary_to != NULL) {
    if (ctx->cfg->logging) {
      fprintf(stderr, "Dumping OpenCL binary to %s...\n", ctx->cfg->dump_binary_to);
    }
    dump_file(ctx->cfg->dump_binary_to, binary, binary_size);
  }

  ctx->clprogram = prog;
}

static struct opencl_device_option get_preferred_device(const struct futhark_context_config *cfg) {
  struct opencl_device_option *devices;
  size_t num_devices;

  opencl_all_device_options(&devices, &num_devices);

  int num_device_matches = 0;

  for (size_t i = 0; i < num_devices; i++) {
    struct opencl_device_option device = devices[i];
    if (strstr(device.platform_name, cfg->preferred_platform) != NULL &&
        strstr(device.device_name, cfg->preferred_device) != NULL &&
        (cfg->ignore_blacklist ||
         !is_blacklisted(device.platform_name, device.device_name, cfg)) &&
        num_device_matches++ == cfg->preferred_device_num) {
      // Free all the platform and device names, except the ones we have chosen.
      for (size_t j = 0; j < num_devices; j++) {
        if (j != i) {
          free(devices[j].platform_name);
          free(devices[j].device_name);
        }
      }
      free(devices);
      return device;
    }
  }

  futhark_panic(1, "Could not find acceptable OpenCL device.\n");
  exit(1); // Never reached
}

static void setup_opencl(struct futhark_context *ctx,
                         const char *srcs[],
                         const char *extra_build_opts[],
                         const char* cache_fname) {
  struct opencl_device_option device_option = get_preferred_device(ctx->cfg);

  if (ctx->cfg->logging) {
    fprintf(stderr, "Using platform: %s\n", device_option.platform_name);
    fprintf(stderr, "Using device: %s\n", device_option.device_name);
  }

  // Note that NVIDIA's OpenCL requires the platform property
  cl_context_properties properties[] = {
    CL_CONTEXT_PLATFORM,
    (cl_context_properties)device_option.platform,
    0
  };

  cl_int clCreateContext_error;
  ctx->ctx = clCreateContext(properties, 1, &device_option.device, NULL, NULL, &clCreateContext_error);
  OPENCL_SUCCEED_FATAL(clCreateContext_error);

  cl_int clCreateCommandQueue_error;
  cl_command_queue queue =
    clCreateCommandQueue(ctx->ctx,
                         device_option.device,
                         ctx->cfg->profiling ? CL_QUEUE_PROFILING_ENABLE : 0,
                         &clCreateCommandQueue_error);
  OPENCL_SUCCEED_FATAL(clCreateCommandQueue_error);

  setup_opencl_with_command_queue(ctx, queue, srcs, extra_build_opts, cache_fname);
}

int backend_context_setup(struct futhark_context* ctx) {
  ctx->lockstep_width = 0; // Real value set later.
  ctx->profiling_records_capacity = 200;
  ctx->profiling_records_used = 0;
  ctx->profiling_records =
    malloc(ctx->profiling_records_capacity *
           sizeof(struct profiling_record));
  ctx->failure_is_an_option = 0;
  ctx->total_runs = 0;
  ctx->total_runtime = 0;
  ctx->peak_mem_usage_device = 0;
  ctx->cur_mem_usage_device = 0;

  if (ctx->cfg->queue_set) {
    setup_opencl_with_command_queue(ctx, ctx->cfg->queue, opencl_program, ctx->cfg->build_opts, ctx->cfg->cache_fname);
  } else {
    setup_opencl(ctx, opencl_program, ctx->cfg->build_opts, ctx->cfg->cache_fname);
  }

  cl_int error;
  cl_int no_error = -1;
  ctx->global_failure =
    clCreateBuffer(ctx->ctx,
                   CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                   sizeof(cl_int), &no_error, &error);
  OPENCL_SUCCEED_OR_RETURN(error);

  // The +1 is to avoid zero-byte allocations.
  ctx->global_failure_args =
    clCreateBuffer(ctx->ctx,
                   CL_MEM_READ_WRITE,
                   sizeof(int64_t)*(max_failure_args+1), NULL, &error);
  OPENCL_SUCCEED_OR_RETURN(error);
  return 0;
}

void backend_context_teardown(struct futhark_context* ctx) {
  OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure));
  OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args));
  (void)opencl_tally_profiling_records(ctx);
  free(ctx->profiling_records);
  (void)opencl_free_all(ctx);
  (void)clReleaseProgram(ctx->clprogram);
  (void)clReleaseCommandQueue(ctx->queue);
  (void)clReleaseContext(ctx->ctx);
}

cl_command_queue futhark_context_get_command_queue(struct futhark_context* ctx) {
  return ctx->queue;
}

// End of backends/opencl.h

static char *get_failure_msg(int failure_idx, int64_t args[])
{
    switch (failure_idx) { }
    return strdup("Unknown error.  This is a compiler bug.");
}
void post_opencl_setup(struct futhark_context *ctx, struct opencl_device_option *option)
{
    if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "NVIDIA CUDA") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->lockstep_width = 32;
    }
    if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "AMD Accelerated Parallel Processing") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->lockstep_width = 32;
    }
    if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->lockstep_width = 1;
    }
    if ((ctx->cfg->default_num_groups == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        size_t MAX_COMPUTE_UNITS_val = 0;

        clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL);
        ctx->cfg->default_num_groups = 4 * MAX_COMPUTE_UNITS_val;
    }
    if ((ctx->cfg->default_group_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->cfg->default_group_size = 256;
    }
    if ((ctx->cfg->default_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->cfg->default_tile_size = 16;
    }
    if ((ctx->cfg->default_reg_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->cfg->default_reg_tile_size = 4;
    }
    if ((ctx->cfg->default_threshold == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_GPU) == CL_DEVICE_TYPE_GPU) {
        ctx->cfg->default_threshold = 32768;
    }
    if ((ctx->lockstep_width == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
        ctx->lockstep_width = 1;
    }
    if ((ctx->cfg->default_num_groups == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
        size_t MAX_COMPUTE_UNITS_val = 0;

        clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL);
        ctx->cfg->default_num_groups = MAX_COMPUTE_UNITS_val;
    }
    if ((ctx->cfg->default_group_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
        ctx->cfg->default_group_size = 32;
    }
    if ((ctx->cfg->default_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
        ctx->cfg->default_tile_size = 4;
    }
    if ((ctx->cfg->default_reg_tile_size == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
        ctx->cfg->default_reg_tile_size = 1;
    }
    if ((ctx->cfg->default_threshold == 0 && strstr(option->platform_name, "") != NULL) && (option->device_type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) {
        size_t MAX_COMPUTE_UNITS_val = 0;

        clGetDeviceInfo(ctx->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS_val), &MAX_COMPUTE_UNITS_val, NULL);
        ctx->cfg->default_threshold = MAX_COMPUTE_UNITS_val;
    }
}
struct program {
    cl_kernel addzisegmap_6892;
    cl_kernel add_i64zisegmap_6912;
    int64_t copy_dev_to_dev_total_runtime;
    int copy_dev_to_dev_runs;
    int64_t copy_dev_to_host_total_runtime;
    int copy_dev_to_host_runs;
    int64_t copy_host_to_dev_total_runtime;
    int copy_host_to_dev_runs;
    int64_t copy_scalar_to_dev_total_runtime;
    int copy_scalar_to_dev_runs;
    int64_t copy_scalar_from_dev_total_runtime;
    int copy_scalar_from_dev_runs;
    int64_t addzisegmap_6892_total_runtime;
    int addzisegmap_6892_runs;
    int64_t add_i64zisegmap_6912_total_runtime;
    int add_i64zisegmap_6912_runs;
};
static void setup_program(struct futhark_context *ctx)
{
    (void) ctx;

    int error = 0;

    (void) error;
    ctx->program = malloc(sizeof(struct program));
    ctx->program->copy_dev_to_dev_total_runtime = 0;
    ctx->program->copy_dev_to_dev_runs = 0;
    ctx->program->copy_dev_to_host_total_runtime = 0;
    ctx->program->copy_dev_to_host_runs = 0;
    ctx->program->copy_host_to_dev_total_runtime = 0;
    ctx->program->copy_host_to_dev_runs = 0;
    ctx->program->copy_scalar_to_dev_total_runtime = 0;
    ctx->program->copy_scalar_to_dev_runs = 0;
    ctx->program->copy_scalar_from_dev_total_runtime = 0;
    ctx->program->copy_scalar_from_dev_runs = 0;
    ctx->program->addzisegmap_6892_total_runtime = 0;
    ctx->program->addzisegmap_6892_runs = 0;
    ctx->program->add_i64zisegmap_6912_total_runtime = 0;
    ctx->program->add_i64zisegmap_6912_runs = 0;
    {
        ctx->program->addzisegmap_6892 = clCreateKernel(ctx->clprogram, "addzisegmap_6892", &error);
        OPENCL_SUCCEED_FATAL(error);
        OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->program->addzisegmap_6892, 0, sizeof(cl_mem), &ctx->global_failure));
        if (ctx->debugging)
            fprintf(ctx->log, "Created kernel %s.\n", "add.segmap_6892");
    }
    {
        ctx->program->add_i64zisegmap_6912 = clCreateKernel(ctx->clprogram, "add_i64zisegmap_6912", &error);
        OPENCL_SUCCEED_FATAL(error);
        OPENCL_SUCCEED_FATAL(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 0, sizeof(cl_mem), &ctx->global_failure));
        if (ctx->debugging)
            fprintf(ctx->log, "Created kernel %s.\n", "add_i64.segmap_6912");
    }
}
static void teardown_program(struct futhark_context *ctx)
{
    (void) ctx;

    int error = 0;

    (void) error;
    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->program->addzisegmap_6892));
    OPENCL_SUCCEED_FATAL(clReleaseKernel(ctx->program->add_i64zisegmap_6912));
    free(ctx->program);
}
static void set_tuning_params(struct futhark_context *ctx)
{
    (void) ctx;
    ctx->tuning_params.addzisegmap_group_sizze_6879 = &ctx->cfg->tuning_params[0];
    ctx->tuning_params.add_i64zisegmap_group_sizze_6899 = &ctx->cfg->tuning_params[1];
}
int memblock_unref_device(struct futhark_context *ctx, struct memblock_device *block, const char *desc)
{
    if (block->references != NULL) {
        *block->references -= 1;
        if (ctx->detail_memory)
            fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "space 'device'", *block->references);
        if (*block->references == 0) {
            ctx->cur_mem_usage_device -= block->size;
            OPENCL_SUCCEED_OR_RETURN(opencl_free(ctx, block->mem, block->size, desc));
            free(block->references);
            if (ctx->detail_memory)
                fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_device);
        }
        block->references = NULL;
    }
    return 0;
}
int memblock_alloc_device(struct futhark_context *ctx, struct memblock_device *block, int64_t size, const char *desc)
{
    if (size < 0)
        futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "space 'device'", ctx->cur_mem_usage_device);

    int ret = memblock_unref_device(ctx, block, desc);

    if (ret != FUTHARK_SUCCESS)
        return ret;
    if (ctx->detail_memory)
        fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "space 'device'", (long long) ctx->cur_mem_usage_device + size);
    if (ctx->cur_mem_usage_device > ctx->peak_mem_usage_device) {
        ctx->peak_mem_usage_device = ctx->cur_mem_usage_device;
        if (ctx->detail_memory)
            fprintf(ctx->log, " (new peak).\n");
    } else if (ctx->detail_memory)
        fprintf(ctx->log, ".\n");
    ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_alloc(ctx, ctx->log, (size_t) size, desc, &block->mem, (size_t *) &size));
    if (ctx->error == NULL) {
        block->references = (int *) malloc(sizeof(int));
        *block->references = 1;
        block->size = size;
        block->desc = desc;
        ctx->cur_mem_usage_device += size;
        return FUTHARK_SUCCESS;
    } else {
        // We are naively assuming that any memory allocation error is due to OOM.
        lock_lock(&ctx->error_lock);

        char *old_error = ctx->error;

        ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated:  %12lld bytes\n%s", "space 'device'", (long long) size, (long long) ctx->cur_mem_usage_device, old_error);
        free(old_error);
        lock_unlock(&ctx->error_lock);
        return FUTHARK_OUT_OF_MEMORY;
    }
}
int memblock_set_device(struct futhark_context *ctx, struct memblock_device *lhs, struct memblock_device *rhs, const char *lhs_desc)
{
    int ret = memblock_unref_device(ctx, lhs, lhs_desc);

    if (rhs->references != NULL)
        (*rhs->references)++;
    *lhs = *rhs;
    return ret;
}
int memblock_unref(struct futhark_context *ctx, struct memblock *block, const char *desc)
{
    if (block->references != NULL) {
        *block->references -= 1;
        if (ctx->detail_memory)
            fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "default space", *block->references);
        if (*block->references == 0) {
            ctx->cur_mem_usage_default -= block->size;
            host_free(ctx, (size_t) block->size, desc, (void *) block->mem);
            free(block->references);
            if (ctx->detail_memory)
                fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_default);
        }
        block->references = NULL;
    }
    return 0;
}
int memblock_alloc(struct futhark_context *ctx, struct memblock *block, int64_t size, const char *desc)
{
    if (size < 0)
        futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "default space", ctx->cur_mem_usage_default);

    int ret = memblock_unref(ctx, block, desc);

    if (ret != FUTHARK_SUCCESS)
        return ret;
    if (ctx->detail_memory)
        fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "default space", (long long) ctx->cur_mem_usage_default + size);
    if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) {
        ctx->peak_mem_usage_default = ctx->cur_mem_usage_default;
        if (ctx->detail_memory)
            fprintf(ctx->log, " (new peak).\n");
    } else if (ctx->detail_memory)
        fprintf(ctx->log, ".\n");
    host_alloc(ctx, (size_t) size, desc, (size_t *) &size, (void *) &block->mem);
    if (ctx->error == NULL) {
        block->references = (int *) malloc(sizeof(int));
        *block->references = 1;
        block->size = size;
        block->desc = desc;
        ctx->cur_mem_usage_default += size;
        return FUTHARK_SUCCESS;
    } else {
        // We are naively assuming that any memory allocation error is due to OOM.
        lock_lock(&ctx->error_lock);

        char *old_error = ctx->error;

        ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated:  %12lld bytes\n%s", "default space", (long long) size, (long long) ctx->cur_mem_usage_default, old_error);
        free(old_error);
        lock_unlock(&ctx->error_lock);
        return FUTHARK_OUT_OF_MEMORY;
    }
}
int memblock_set(struct futhark_context *ctx, struct memblock *lhs, struct memblock *rhs, const char *lhs_desc)
{
    int ret = memblock_unref(ctx, lhs, lhs_desc);

    if (rhs->references != NULL)
        (*rhs->references)++;
    *lhs = *rhs;
    return ret;
}
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag)
{
    cfg->profiling = cfg->logging = cfg->debugging = flag;
}
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag)
{
    cfg->profiling = flag;
}
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag)
{
    cfg->logging = flag;
}
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f)
{
    cfg->cache_fname = f;
}
int futhark_get_tuning_param_count(void)
{
    return num_tuning_params;
}
const char *futhark_get_tuning_param_name(int i)
{
    return tuning_param_names[i];
}
const char *futhark_get_tuning_param_class(int i)
{
    return tuning_param_classes[i];
}
char *futhark_context_report(struct futhark_context *ctx)
{
    if (futhark_context_sync(ctx) != 0)
        return NULL;

    struct str_builder builder;

    str_builder_init(&builder);
    str_builder(&builder, "Peak memory usage for space 'device': %lld bytes.\n", (long long) ctx->peak_mem_usage_device);
    { }
    if (ctx->profiling) {
        OPENCL_SUCCEED_FATAL(opencl_tally_profiling_records(ctx));
        str_builder(&builder, "copy_dev_to_dev      ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_dev_to_dev_runs, (long) ctx->program->copy_dev_to_dev_total_runtime / (ctx->program->copy_dev_to_dev_runs != 0 ? ctx->program->copy_dev_to_dev_runs : 1), (long) ctx->program->copy_dev_to_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_dev_to_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_dev_to_dev_runs;
        str_builder(&builder, "copy_dev_to_host     ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_dev_to_host_runs, (long) ctx->program->copy_dev_to_host_total_runtime / (ctx->program->copy_dev_to_host_runs != 0 ? ctx->program->copy_dev_to_host_runs : 1), (long) ctx->program->copy_dev_to_host_total_runtime);
        ctx->total_runtime += ctx->program->copy_dev_to_host_total_runtime;
        ctx->total_runs += ctx->program->copy_dev_to_host_runs;
        str_builder(&builder, "copy_host_to_dev     ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_host_to_dev_runs, (long) ctx->program->copy_host_to_dev_total_runtime / (ctx->program->copy_host_to_dev_runs != 0 ? ctx->program->copy_host_to_dev_runs : 1), (long) ctx->program->copy_host_to_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_host_to_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_host_to_dev_runs;
        str_builder(&builder, "copy_scalar_to_dev   ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_scalar_to_dev_runs, (long) ctx->program->copy_scalar_to_dev_total_runtime / (ctx->program->copy_scalar_to_dev_runs != 0 ? ctx->program->copy_scalar_to_dev_runs : 1), (long) ctx->program->copy_scalar_to_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_scalar_to_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_scalar_to_dev_runs;
        str_builder(&builder, "copy_scalar_from_dev ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->copy_scalar_from_dev_runs, (long) ctx->program->copy_scalar_from_dev_total_runtime / (ctx->program->copy_scalar_from_dev_runs != 0 ? ctx->program->copy_scalar_from_dev_runs : 1), (long) ctx->program->copy_scalar_from_dev_total_runtime);
        ctx->total_runtime += ctx->program->copy_scalar_from_dev_total_runtime;
        ctx->total_runs += ctx->program->copy_scalar_from_dev_runs;
        str_builder(&builder, "add.segmap_6892      ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->addzisegmap_6892_runs, (long) ctx->program->addzisegmap_6892_total_runtime / (ctx->program->addzisegmap_6892_runs != 0 ? ctx->program->addzisegmap_6892_runs : 1), (long) ctx->program->addzisegmap_6892_total_runtime);
        ctx->total_runtime += ctx->program->addzisegmap_6892_total_runtime;
        ctx->total_runs += ctx->program->addzisegmap_6892_runs;
        str_builder(&builder, "add_i64.segmap_6912  ran %5d times; avg: %8ldus; total: %8ldus\n", ctx->program->add_i64zisegmap_6912_runs, (long) ctx->program->add_i64zisegmap_6912_total_runtime / (ctx->program->add_i64zisegmap_6912_runs != 0 ? ctx->program->add_i64zisegmap_6912_runs : 1), (long) ctx->program->add_i64zisegmap_6912_total_runtime);
        ctx->total_runtime += ctx->program->add_i64zisegmap_6912_total_runtime;
        ctx->total_runs += ctx->program->add_i64zisegmap_6912_runs;
        str_builder(&builder, "%d operations with cumulative runtime: %6ldus\n", ctx->total_runs, ctx->total_runtime);
    }
    return builder.str;
}
char *futhark_context_get_error(struct futhark_context *ctx)
{
    char *error = ctx->error;

    ctx->error = NULL;
    return error;
}
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f)
{
    ctx->log = f;
}
void futhark_context_pause_profiling(struct futhark_context *ctx)
{
    ctx->profiling_paused = 1;
}
void futhark_context_unpause_profiling(struct futhark_context *ctx)
{
    ctx->profiling_paused = 0;
}
int futhark_context_clear_caches(struct futhark_context *ctx)
{
    lock_lock(&ctx->lock);
    ctx->peak_mem_usage_device = 0;
    ctx->peak_mem_usage_default = 0;
    if (ctx->error == NULL)
        ctx->error = OPENCL_SUCCEED_NONFATAL(opencl_free_all(ctx));
    lock_unlock(&ctx->lock);
    return ctx->error != NULL;
}

// Start of context.h

// Eventually it would be nice to move the context definition in here
// instead of generating it in the compiler.  For now it defines
// various helper functions that must be available.

// Internal functions.

static void set_error(struct futhark_context* ctx, char *error) {
  lock_lock(&ctx->error_lock);
  if (ctx->error == NULL) {
    ctx->error = error;
  } else {
    free(error);
  }
  lock_unlock(&ctx->error_lock);
}

// XXX: should be static, but used in ispc_util.h
void lexical_realloc_error(struct futhark_context* ctx, size_t new_size) {
  set_error(ctx,
            msgprintf("Failed to allocate memory.\nAttempted allocation: %12lld bytes\n",
                      (long long) new_size));
}

static int lexical_realloc(struct futhark_context *ctx,
                           unsigned char **ptr,
                           int64_t *old_size,
                           int64_t new_size) {
  unsigned char *new = realloc(*ptr, (size_t)new_size);
  if (new == NULL) {
    lexical_realloc_error(ctx, new_size);
    return FUTHARK_OUT_OF_MEMORY;
  } else {
    *ptr = new;
    *old_size = new_size;
    return FUTHARK_SUCCESS;
  }
}

static void free_all_in_free_list(struct futhark_context* ctx) {
  fl_mem mem;
  free_list_pack(&ctx->free_list);
  while (free_list_first(&ctx->free_list, (fl_mem*)&mem) == 0) {
    free((void*)mem);
  }
}

static int is_small_alloc(size_t size) {
  return size < 1024*1024;
}

static void host_alloc(struct futhark_context* ctx,
                       size_t size, const char* tag, size_t* size_out, void** mem_out) {
  if (is_small_alloc(size) || free_list_find(&ctx->free_list, size, tag, size_out, (fl_mem*)mem_out) != 0) {
    *size_out = size;
    *mem_out = malloc(size);
  }
}

static void host_free(struct futhark_context* ctx,
                      size_t size, const char* tag, void* mem) {
  // Small allocations are handled by malloc()s own free list.  The
  // threshold here is kind of arbitrary, but seems to work OK.
  // Larger allocations are mmap()ed/munmapped() every time, which is
  // very slow, and Futhark programs tend to use a few very large
  // allocations.
  if (is_small_alloc(size)) {
    free(mem);
  } else {
    free_list_insert(&ctx->free_list, size, (fl_mem)mem, tag);
  }
}

struct futhark_context_config* futhark_context_config_new(void) {
  struct futhark_context_config* cfg = malloc(sizeof(struct futhark_context_config));
  if (cfg == NULL) {
    return NULL;
  }
  cfg->in_use = 0;
  cfg->debugging = 0;
  cfg->profiling = 0;
  cfg->logging = 0;
  cfg->cache_fname = NULL;
  cfg->num_tuning_params = num_tuning_params;
  cfg->tuning_params = malloc(cfg->num_tuning_params * sizeof(int64_t));
  memcpy(cfg->tuning_params, tuning_param_defaults,
         cfg->num_tuning_params * sizeof(int64_t));
  cfg->tuning_param_names = tuning_param_names;
  cfg->tuning_param_vars = tuning_param_vars;
  cfg->tuning_param_classes = tuning_param_classes;
  backend_context_config_setup(cfg);
  return cfg;
}

void futhark_context_config_free(struct futhark_context_config* cfg) {
  assert(!cfg->in_use);
  backend_context_config_teardown(cfg);
  free(cfg->tuning_params);
  free(cfg);
}

struct futhark_context* futhark_context_new(struct futhark_context_config* cfg) {
  struct futhark_context* ctx = malloc(sizeof(struct futhark_context));
  if (ctx == NULL) {
    return NULL;
  }
  assert(!cfg->in_use);
  ctx->cfg = cfg;
  ctx->cfg->in_use = 1;
  create_lock(&ctx->error_lock);
  create_lock(&ctx->lock);
  free_list_init(&ctx->free_list);
  ctx->peak_mem_usage_default = 0;
  ctx->cur_mem_usage_default = 0;
  ctx->constants = malloc(sizeof(struct constants));
  ctx->detail_memory = cfg->debugging;
  ctx->debugging = cfg->debugging;
  ctx->logging = cfg->logging;
  ctx->profiling = cfg->profiling;
  ctx->profiling_paused = 0;
  ctx->error = NULL;
  ctx->log = stderr;
  if (backend_context_setup(ctx) == 0) {
    set_tuning_params(ctx);
    setup_program(ctx);
    init_constants(ctx);
    (void)futhark_context_clear_caches(ctx);
    (void)futhark_context_sync(ctx);
  }
  return ctx;
}

void futhark_context_free(struct futhark_context* ctx) {
  free_constants(ctx);
  teardown_program(ctx);
  backend_context_teardown(ctx);
  free_all_in_free_list(ctx);
  free_list_destroy(&ctx->free_list);
  free(ctx->constants);
  free_lock(&ctx->lock);
  free_lock(&ctx->error_lock);
  ctx->cfg->in_use = 0;
  free(ctx);
}

// End of context.h

static int futrts_entry_add(struct futhark_context *ctx, struct memblock_device *mem_out_p_6932, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6776);
static int futrts_entry_add_i64(struct futhark_context *ctx, struct memblock_device *mem_out_p_6938, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6836);

static int init_constants(struct futhark_context *ctx)
{
    (void) ctx;

    int err = 0;


  cleanup:
    return err;
}
static int free_constants(struct futhark_context *ctx)
{
    (void) ctx;
    return 0;
}
struct futhark_u8_1d {
    struct memblock_device mem;
    int64_t shape[1];
};
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0)
{
    struct futhark_u8_1d *bad = NULL;
    struct futhark_u8_1d *arr = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d));

    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 1, "arr->mem"))
        return NULL;
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 1 > 0)
        OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->queue, arr->mem.mem, CL_FALSE, (size_t) 0, (size_t) ((size_t) dim0 * 1), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_host_runs, &ctx->program->copy_dev_to_host_total_runtime)));
    lock_unlock(&ctx->lock);
    return arr;
}
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0)
{
    struct futhark_u8_1d *bad = NULL;
    struct futhark_u8_1d *arr = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d));

    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 1, "arr->mem"))
        return NULL;
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 1 > 0) {
        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->queue, data, arr->mem.mem, (size_t) offset, (size_t) 0, (size_t) ((size_t) dim0 * 1), 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_dev_runs, &ctx->program->copy_dev_to_dev_total_runtime)));
        if (ctx->debugging)
            OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
    }
    lock_unlock(&ctx->lock);
    return arr;
}
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr)
{
    lock_lock(&ctx->lock);
    if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0)
        return 1;
    lock_unlock(&ctx->lock);
    free(arr);
    return 0;
}
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data)
{
    lock_lock(&ctx->lock);
    if ((size_t) arr->shape[0] * 1 > 0) {
        cl_bool sync_call = CL_FALSE;

        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->queue, arr->mem.mem, ctx->failure_is_an_option ? CL_FALSE : sync_call, (size_t) 0, (size_t) ((size_t) arr->shape[0] * 1), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_host_to_dev_runs, &ctx->program->copy_host_to_dev_total_runtime)));
        if ((sync_call && ctx->failure_is_an_option) && futhark_context_sync(ctx) != 0)
            return 1;
    }
    lock_unlock(&ctx->lock);
    return 0;
}
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr)
{
    (void) ctx;
    return arr->mem.mem;
}
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr)
{
    (void) ctx;
    return arr->shape;
}
struct futhark_i64_1d {
    struct memblock_device mem;
    int64_t shape[1];
};
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0)
{
    struct futhark_i64_1d *bad = NULL;
    struct futhark_i64_1d *arr = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d));

    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 8, "arr->mem"))
        return NULL;
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 8 > 0)
        OPENCL_SUCCEED_OR_RETURN(clEnqueueWriteBuffer(ctx->queue, arr->mem.mem, CL_FALSE, (size_t) 0, (size_t) ((size_t) dim0 * 8), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_host_runs, &ctx->program->copy_dev_to_host_total_runtime)));
    lock_unlock(&ctx->lock);
    return arr;
}
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0)
{
    struct futhark_i64_1d *bad = NULL;
    struct futhark_i64_1d *arr = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d));

    if (arr == NULL)
        return bad;
    lock_lock(&ctx->lock);
    arr->mem.references = NULL;
    if (memblock_alloc_device(ctx, &arr->mem, dim0 * 8, "arr->mem"))
        return NULL;
    arr->shape[0] = dim0;
    if ((size_t) dim0 * 8 > 0) {
        OPENCL_SUCCEED_OR_RETURN(clEnqueueCopyBuffer(ctx->queue, data, arr->mem.mem, (size_t) offset, (size_t) 0, (size_t) ((size_t) dim0 * 8), 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_dev_to_dev_runs, &ctx->program->copy_dev_to_dev_total_runtime)));
        if (ctx->debugging)
            OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
    }
    lock_unlock(&ctx->lock);
    return arr;
}
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr)
{
    lock_lock(&ctx->lock);
    if (memblock_unref_device(ctx, &arr->mem, "arr->mem") != 0)
        return 1;
    lock_unlock(&ctx->lock);
    free(arr);
    return 0;
}
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data)
{
    lock_lock(&ctx->lock);
    if ((size_t) arr->shape[0] * 8 > 0) {
        cl_bool sync_call = CL_FALSE;

        OPENCL_SUCCEED_OR_RETURN(clEnqueueReadBuffer(ctx->queue, arr->mem.mem, ctx->failure_is_an_option ? CL_FALSE : sync_call, (size_t) 0, (size_t) ((size_t) arr->shape[0] * 8), data + 0, 0, NULL, ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->copy_host_to_dev_runs, &ctx->program->copy_host_to_dev_total_runtime)));
        if ((sync_call && ctx->failure_is_an_option) && futhark_context_sync(ctx) != 0)
            return 1;
    }
    lock_unlock(&ctx->lock);
    return 0;
}
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr)
{
    (void) ctx;
    return arr->mem.mem;
}
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr)
{
    (void) ctx;
    return arr->shape;
}

static int futrts_entry_add(struct futhark_context *ctx, struct memblock_device *mem_out_p_6932, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6776)
{
    (void) ctx;

    int err = 0;
    struct memblock_device mem_6920;

    mem_6920.references = NULL;

    struct memblock_device mem_out_6922;

    mem_out_6922.references = NULL;

    int64_t bytes_6919 = smax64((int64_t) 0, n_6776);
    int64_t segmap_group_sizze_6888;

    segmap_group_sizze_6888 = *ctx->tuning_params.addzisegmap_group_sizze_6879;

    int64_t segmap_usable_groups_6889 = sdiv_up64(n_6776, segmap_group_sizze_6888);

    if (memblock_alloc_device(ctx, &mem_6920, bytes_6919, "mem_6920")) {
        err = 1;
        goto cleanup;
    }
    if (ctx->debugging)
        fprintf(ctx->log, "%s\n", "\n# SegMap");

    int32_t virt_num_groups_6923 = sext_i64_i32(sdiv_up64(n_6776, segmap_group_sizze_6888));

    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 1, sizeof(n_6776), &n_6776));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 2, sizeof(xs_mem_6916.mem), &xs_mem_6916.mem));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 3, sizeof(ys_mem_6917.mem), &ys_mem_6917.mem));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->addzisegmap_6892, 4, sizeof(mem_6920.mem), &mem_6920.mem));
    if (1 * ((size_t) segmap_usable_groups_6889 * (size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879) != 0) {
        const size_t global_work_sizze_6933[1] = {(size_t) segmap_usable_groups_6889 * (size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879};
        const size_t local_work_sizze_6937[1] = {(size_t) *ctx->tuning_params.addzisegmap_group_sizze_6879};
        int64_t time_start_6934 = 0, time_end_6935 = 0;

        if (ctx->debugging) {
            fprintf(ctx->log, "Launching %s with global work size [%zu] and local work size [%zu]; local memory: %d bytes.\n", "add.segmap_6892", global_work_sizze_6933[0], local_work_sizze_6937[0], (int) 0);
            time_start_6934 = get_wall_time();
        }

        cl_event *pevent = ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->addzisegmap_6892_runs, &ctx->program->addzisegmap_6892_total_runtime);

        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->queue, ctx->program->addzisegmap_6892, 1, NULL, global_work_sizze_6933, local_work_sizze_6937, 0, NULL, pevent));
        if (ctx->debugging) {
            OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
            time_end_6935 = get_wall_time();

            long time_diff_6936 = time_end_6935 - time_start_6934;

            fprintf(ctx->log, "kernel %s runtime: %ldus\n", "add.segmap_6892", time_diff_6936);
        }
    }
    if (ctx->debugging)
        fprintf(ctx->log, "%s\n", "");
    if (memblock_set_device(ctx, &mem_out_6922, &mem_6920, "mem_6920") != 0)
        return 1;
    if (memblock_set_device(ctx, &*mem_out_p_6932, &mem_out_6922, "mem_out_6922") != 0)
        return 1;

  cleanup:
    {
        if (memblock_unref_device(ctx, &mem_6920, "mem_6920") != 0)
            return 1;
        if (memblock_unref_device(ctx, &mem_out_6922, "mem_out_6922") != 0)
            return 1;
    }
    return err;
}
static int futrts_entry_add_i64(struct futhark_context *ctx, struct memblock_device *mem_out_p_6938, struct memblock_device xs_mem_6916, struct memblock_device ys_mem_6917, int64_t n_6836)
{
    (void) ctx;

    int err = 0;
    struct memblock_device mem_6921;

    mem_6921.references = NULL;

    struct memblock_device mem_out_6922;

    mem_out_6922.references = NULL;

    int64_t binop_y_6919 = (int64_t) 8 * n_6836;
    int64_t bytes_6920 = smax64((int64_t) 0, binop_y_6919);
    int64_t segmap_group_sizze_6908;

    segmap_group_sizze_6908 = *ctx->tuning_params.add_i64zisegmap_group_sizze_6899;

    int64_t segmap_usable_groups_6909 = sdiv_up64(n_6836, segmap_group_sizze_6908);

    if (memblock_alloc_device(ctx, &mem_6921, bytes_6920, "mem_6921")) {
        err = 1;
        goto cleanup;
    }
    if (ctx->debugging)
        fprintf(ctx->log, "%s\n", "\n# SegMap");

    int32_t virt_num_groups_6923 = sext_i64_i32(sdiv_up64(n_6836, segmap_group_sizze_6908));

    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 1, sizeof(n_6836), &n_6836));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 2, sizeof(xs_mem_6916.mem), &xs_mem_6916.mem));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 3, sizeof(ys_mem_6917.mem), &ys_mem_6917.mem));
    OPENCL_SUCCEED_OR_RETURN(clSetKernelArg(ctx->program->add_i64zisegmap_6912, 4, sizeof(mem_6921.mem), &mem_6921.mem));
    if (1 * ((size_t) segmap_usable_groups_6909 * (size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899) != 0) {
        const size_t global_work_sizze_6939[1] = {(size_t) segmap_usable_groups_6909 * (size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899};
        const size_t local_work_sizze_6943[1] = {(size_t) *ctx->tuning_params.add_i64zisegmap_group_sizze_6899};
        int64_t time_start_6940 = 0, time_end_6941 = 0;

        if (ctx->debugging) {
            fprintf(ctx->log, "Launching %s with global work size [%zu] and local work size [%zu]; local memory: %d bytes.\n", "add_i64.segmap_6912", global_work_sizze_6939[0], local_work_sizze_6943[0], (int) 0);
            time_start_6940 = get_wall_time();
        }

        cl_event *pevent = ctx->profiling_paused || !ctx->profiling ? NULL : opencl_get_event(ctx, &ctx->program->add_i64zisegmap_6912_runs, &ctx->program->add_i64zisegmap_6912_total_runtime);

        OPENCL_SUCCEED_OR_RETURN(clEnqueueNDRangeKernel(ctx->queue, ctx->program->add_i64zisegmap_6912, 1, NULL, global_work_sizze_6939, local_work_sizze_6943, 0, NULL, pevent));
        if (ctx->debugging) {
            OPENCL_SUCCEED_FATAL(clFinish(ctx->queue));
            time_end_6941 = get_wall_time();

            long time_diff_6942 = time_end_6941 - time_start_6940;

            fprintf(ctx->log, "kernel %s runtime: %ldus\n", "add_i64.segmap_6912", time_diff_6942);
        }
    }
    if (ctx->debugging)
        fprintf(ctx->log, "%s\n", "");
    if (memblock_set_device(ctx, &mem_out_6922, &mem_6921, "mem_6921") != 0)
        return 1;
    if (memblock_set_device(ctx, &*mem_out_p_6938, &mem_out_6922, "mem_out_6922") != 0)
        return 1;

  cleanup:
    {
        if (memblock_unref_device(ctx, &mem_6921, "mem_6921") != 0)
            return 1;
        if (memblock_unref_device(ctx, &mem_out_6922, "mem_out_6922") != 0)
            return 1;
    }
    return err;
}

int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1)
{
    int64_t n_6776 = (int64_t) 0;
    int ret = 0;

    lock_lock(&ctx->lock);

    struct memblock_device mem_out_6922;

    mem_out_6922.references = NULL;

    struct memblock_device ys_mem_6917;

    ys_mem_6917.references = NULL;

    struct memblock_device xs_mem_6916;

    xs_mem_6916.references = NULL;
    xs_mem_6916 = in0->mem;
    n_6776 = in0->shape[0];
    ys_mem_6917 = in1->mem;
    n_6776 = in1->shape[0];
    if (!(n_6776 == in0->shape[0] && n_6776 == in1->shape[0])) {
        ret = 1;
        set_error(ctx, msgprintf("Error: entry point arguments have invalid sizes.\n"));
    }
    if (ret == 0) {
        ret = futrts_entry_add(ctx, &mem_out_6922, xs_mem_6916, ys_mem_6917, n_6776);
        if (ret == 0) {
            assert((*out0 = (struct futhark_u8_1d *) malloc(sizeof(struct futhark_u8_1d))) != NULL);
            (*out0)->mem = mem_out_6922;
            (*out0)->shape[0] = n_6776;
        }
    }
    lock_unlock(&ctx->lock);
    return ret;
}
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1)
{
    int64_t n_6836 = (int64_t) 0;
    int ret = 0;

    lock_lock(&ctx->lock);

    struct memblock_device mem_out_6922;

    mem_out_6922.references = NULL;

    struct memblock_device ys_mem_6917;

    ys_mem_6917.references = NULL;

    struct memblock_device xs_mem_6916;

    xs_mem_6916.references = NULL;
    xs_mem_6916 = in0->mem;
    n_6836 = in0->shape[0];
    ys_mem_6917 = in1->mem;
    n_6836 = in1->shape[0];
    if (!(n_6836 == in0->shape[0] && n_6836 == in1->shape[0])) {
        ret = 1;
        set_error(ctx, msgprintf("Error: entry point arguments have invalid sizes.\n"));
    }
    if (ret == 0) {
        ret = futrts_entry_add_i64(ctx, &mem_out_6922, xs_mem_6916, ys_mem_6917, n_6836);
        if (ret == 0) {
            assert((*out0 = (struct futhark_i64_1d *) malloc(sizeof(struct futhark_i64_1d))) != NULL);
            (*out0)->mem = mem_out_6922;
            (*out0)->shape[0] = n_6836;
        }
    }
    lock_unlock(&ctx->lock);
    return ret;
}


## lib_map.ex
defmodule Map.NIF do
  @on_load :load_nifs

  def load_nifs do
    :erlang.load_nif('./lib_map_nif', 0)
  end

  def futhark_context_config_new do
    raise "NIF futhark_context_config_new not implemented"
  end

  def futhark_context_new(_cfg) do
    raise "NIF futhark_context_new not implemented"
  end

  def futhark_context_sync(_ctx) do
    raise "NIF futhark_context_sync not implemented"
  end

  def futhark_new_i64_1d(_ctx, _binary) do
    raise "NIF futhark_new_i64_1d not implemented"
  end

  def futhark_i64_1d_to_binary(_ctx, _in) do
    raise "NIF futhark_i64_1d_to_binary not implemented"
  end

  def futhark_new_u8_1d(_ctx, _binary) do
    raise "NIF futhark_new_u8_1d not implemented"
  end

  def futhark_u8_1d_to_binary(_ctx, _in) do
    raise "NIF futhark_u8_1d_to_binary not implemented"
  end

  def futhark_entry_add(_ctx, _xs, _ys) do
    raise "NIF futhark_entry_add not implemented"
  end

  def futhark_entry_add_i64(_ctx, _xs, _ys) do
    raise "NIF futhark_entry_add_i64 not implemented"
  end

end

## lib_map.h
// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs)
// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]
#pragma once


// Headers
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <float.h>

#define CL_TARGET_OPENCL_VERSION 120
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

// Initialisation
struct futhark_context_config;
struct futhark_context_config *futhark_context_config_new(void);
void futhark_context_config_free(struct futhark_context_config *cfg);
int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value);
struct futhark_context;
struct futhark_context *futhark_context_new(struct futhark_context_config *cfg);
void futhark_context_free(struct futhark_context *cfg);
void futhark_context_config_add_build_option(struct futhark_context_config *cfg, const char *opt);
void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_platform(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
void futhark_context_config_list_devices(struct futhark_context_config *cfg);
void futhark_context_config_dump_program_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_program_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_dump_binary_to(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_load_binary_from(struct futhark_context_config *cfg, const char *s);
void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);
void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue);
void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag);
void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag);
int futhark_get_tuning_param_count(void);
const char *futhark_get_tuning_param_name(int);
const char *futhark_get_tuning_param_class(int);

// Arrays
struct futhark_i64_1d;
struct futhark_i64_1d *futhark_new_i64_1d(struct futhark_context *ctx, const int64_t *data, int64_t dim0);
struct futhark_i64_1d *futhark_new_raw_i64_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
int futhark_values_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr, int64_t *data);
cl_mem futhark_values_raw_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
const int64_t *futhark_shape_i64_1d(struct futhark_context *ctx, struct futhark_i64_1d *arr);
struct futhark_u8_1d;
struct futhark_u8_1d *futhark_new_u8_1d(struct futhark_context *ctx, const uint8_t *data, int64_t dim0);
struct futhark_u8_1d *futhark_new_raw_u8_1d(struct futhark_context *ctx, const cl_mem data, int64_t offset, int64_t dim0);
int futhark_free_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
int futhark_values_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr, uint8_t *data);
cl_mem futhark_values_raw_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);
const int64_t *futhark_shape_u8_1d(struct futhark_context *ctx, struct futhark_u8_1d *arr);

// Opaque values


// Entry points
int futhark_entry_add(struct futhark_context *ctx, struct futhark_u8_1d **out0, const struct futhark_u8_1d *in0, const struct futhark_u8_1d *in1);
int futhark_entry_add_i64(struct futhark_context *ctx, struct futhark_i64_1d **out0, const struct futhark_i64_1d *in0, const struct futhark_i64_1d *in1);

// Miscellaneous
int futhark_context_sync(struct futhark_context *ctx);
cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f);
char *futhark_context_report(struct futhark_context *ctx);
char *futhark_context_get_error(struct futhark_context *ctx);
void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
void futhark_context_pause_profiling(struct futhark_context *ctx);
void futhark_context_unpause_profiling(struct futhark_context *ctx);
int futhark_context_clear_caches(struct futhark_context *ctx);
#define FUTHARK_BACKEND_opencl
#define FUTHARK_SUCCESS 0
#define FUTHARK_PROGRAM_ERROR 2
#define FUTHARK_OUT_OF_MEMORY 3

#ifdef __cplusplus
}
#endif

## lib_map.json
{
  "backend": "opencl",
  "entry_points": {
    "add": {
      "cfun": "futhark_entry_add",
      "inputs": [
        {
          "name": "xs",
          "type": "[]u8",
          "unique": false
        },
        {
          "name": "ys",
          "type": "[]u8",
          "unique": false
        }
      ],
      "outputs": [
        {
          "type": "[]u8",
          "unique": false
        }
      ],
      "tuning_params": [
        "add.segmap_group_size_6879"
      ]
    },
    "add_i64": {
      "cfun": "futhark_entry_add_i64",
      "inputs": [
        {
          "name": "xs",
          "type": "[]i64",
          "unique": false
        },
        {
          "name": "ys",
          "type": "[]i64",
          "unique": false
        }
      ],
      "outputs": [
        {
          "type": "[]i64",
          "unique": false
        }
      ],
      "tuning_params": [
        "add_i64.segmap_group_size_6899"
      ]
    }
  },
  "types": {
    "[]i64": {
      "ctype": "struct futhark_i64_1d *",
      "elemtype": "i64",
      "kind": "array",
      "ops": {
        "free": "futhark_free_i64_1d",
        "new": "futhark_new_i64_1d",
        "shape": "futhark_shape_i64_1d",
        "values": "futhark_values_i64_1d"
      },
      "rank": 1
    },
    "[]u8": {
      "ctype": "struct futhark_u8_1d *",
      "elemtype": "u8",
      "kind": "array",
      "ops": {
        "free": "futhark_free_u8_1d",
        "new": "futhark_new_u8_1d",
        "shape": "futhark_shape_u8_1d",
        "values": "futhark_values_u8_1d"
      },
      "rank": 1
    }
  },
  "version": "0.25.0 (prerelease - include info below when reporting bugs)\ngit: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]"
}

## lib_map_nif.c
#include <erl_nif.h>
#include "lib_map.c"

struct futhark_context;

ERL_NIF_TERM atom_ok;

ErlNifResourceType* CONFIG_TYPE;
ErlNifResourceType* CONTEXT_TYPE;

ErlNifResourceType* I64_1D;
ErlNifResourceType* U8_1D;

static int open_resource(ErlNifEnv* env, ErlNifResourceType** resource_type, const char* name)
{
  const char* mod = "resources";
  int flags = ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER;

  *resource_type = enif_open_resource_type(env, mod, name, NULL, flags, NULL);
  if(CONFIG_TYPE == NULL) return -1;
  return 0;
}

static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info)
{
  if(open_resource(env, &CONFIG_TYPE, "Config") == -1) return -1;
  if(open_resource(env, &CONTEXT_TYPE, "Context") == -1) return -1;
  if(open_resource(env, &I64_1D, "i64_1d") == -1) return -1;
  if(open_resource(env, &U8_1D, "u8_1d") == -1) return -1;

  atom_ok = enif_make_atom(env, "ok");

  return 0;
}

static ERL_NIF_TERM futhark_context_config_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context_config **res;

  ERL_NIF_TERM ret;

  if(argc != 0) {
    return enif_make_badarg(env);
  }

  res = enif_alloc_resource(CONFIG_TYPE, sizeof(struct futhark_context_config *));
  if(res == NULL) return enif_make_badarg(env);

  struct futhark_context_config* tmp = futhark_context_config_new();

  *res = tmp;

  ret = enif_make_resource(env, res);
  enif_release_resource(res);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_context_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context_config **cfg;
  struct futhark_context **res;

  ERL_NIF_TERM ret;

  if(argc != 1) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONFIG_TYPE, (void**) &cfg)) {
    return enif_make_badarg(env);
  }

  res = enif_alloc_resource(CONTEXT_TYPE, sizeof(struct futhark_context *));
  if(res == NULL) return enif_make_badarg(env);

  struct futhark_context* tmp = futhark_context_new(*cfg);

  *res = tmp;

  ret = enif_make_resource(env, res);
  enif_release_resource(res);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_context_sync_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;

  if(argc != 1) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  futhark_context_sync(*ctx);

  return atom_ok;
}

static ERL_NIF_TERM futhark_new_i64_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  ErlNifBinary bin;

  struct futhark_i64_1d **res;
  ERL_NIF_TERM ret;

  if(argc != 2) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  if (!enif_inspect_binary(env, argv[1], &bin)) {
    return enif_make_badarg(env);
  }

  res = enif_alloc_resource(I64_1D, sizeof(struct futhark_i64_1d *));
  if(res == NULL) return enif_make_badarg(env);

  struct futhark_i64_1d * tmp = futhark_new_i64_1d(*ctx, (const int64_t *)bin.data, bin.size / sizeof(int64_t));
  const int64_t *shape = futhark_shape_i64_1d(*ctx, tmp);

  *res = tmp;

  ret = enif_make_resource(env, res);
  enif_release_resource(res);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_i64_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  struct futhark_i64_1d **xs;

  ErlNifBinary binary;
  ERL_NIF_TERM ret;

  if(argc != 2) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }

  const int64_t *shape = futhark_shape_i64_1d(*ctx, *xs);

  enif_alloc_binary(shape[0] * sizeof(int64_t), &binary);

  if (futhark_values_i64_1d(*ctx, *xs, (int64_t *)(binary.data)) != 0) return enif_make_badarg(env);
  futhark_context_sync(*ctx);

  ret = enif_make_binary(env, &binary);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_new_u8_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  ErlNifBinary bin;

  struct futhark_u8_1d **res;
  ERL_NIF_TERM ret;

  if(argc != 2) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  if (!enif_inspect_binary(env, argv[1], &bin)) {
    return enif_make_badarg(env);
  }

  res = enif_alloc_resource(U8_1D, sizeof(struct futhark_u8_1d *));
  if(res == NULL) return enif_make_badarg(env);

  struct futhark_u8_1d * tmp = futhark_new_u8_1d(*ctx, (const uint8_t *)bin.data, bin.size / sizeof(uint8_t));
  const int64_t *shape = futhark_shape_u8_1d(*ctx, tmp);

  *res = tmp;

  ret = enif_make_resource(env, res);
  enif_release_resource(res);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_u8_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;
  struct futhark_u8_1d **xs;

  ErlNifBinary binary;
  ERL_NIF_TERM ret;

  if(argc != 2) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }

  const int64_t *shape = futhark_shape_u8_1d(*ctx, *xs);

  enif_alloc_binary(shape[0] * sizeof(uint8_t), &binary);

  if (futhark_values_u8_1d(*ctx, *xs, (uint8_t *)(binary.data)) != 0) return enif_make_badarg(env);
  futhark_context_sync(*ctx);

  ret = enif_make_binary(env, &binary);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_entry_add_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;

  struct futhark_u8_1d **xs;
  struct futhark_u8_1d **ys;

  struct futhark_u8_1d **res;

  ERL_NIF_TERM ret;

  if(argc != 3) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[2], U8_1D, (void**) &ys)) {
    return enif_make_badarg(env);
  }

  res = enif_alloc_resource(U8_1D, sizeof(struct futhark_u8_1d *));
  if(res == NULL) return enif_make_badarg(env);

  if (futhark_entry_add(*ctx, res, *xs, *ys) != 0) return enif_make_badarg(env);

  ret = enif_make_resource(env, res);
  enif_release_resource(res);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ERL_NIF_TERM futhark_entry_add_i64_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
  struct futhark_context **ctx;

  struct futhark_i64_1d **xs;
  struct futhark_i64_1d **ys;

  struct futhark_i64_1d **res;

  ERL_NIF_TERM ret;

  if(argc != 3) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
    return enif_make_badarg(env);
  }

  if(!enif_get_resource(env, argv[2], I64_1D, (void**) &ys)) {
    return enif_make_badarg(env);
  }

  res = enif_alloc_resource(I64_1D, sizeof(struct futhark_i64_1d *));
  if(res == NULL) return enif_make_badarg(env);

  if (futhark_entry_add_i64(*ctx, res, *xs, *ys) != 0) return enif_make_badarg(env);

  ret = enif_make_resource(env, res);
  enif_release_resource(res);

  return enif_make_tuple2(env, atom_ok, ret);
}

static ErlNifFunc nif_funcs[] = {
  {"futhark_context_config_new", 0, futhark_context_config_new_nif},
  {"futhark_context_new", 1, futhark_context_new_nif},

  {"futhark_new_i64_1d", 2, futhark_new_i64_1d_nif},
  {"futhark_i64_1d_to_binary", 2, futhark_i64_1d_to_binary_nif},
  {"futhark_new_u8_1d", 2, futhark_new_u8_1d_nif},
  {"futhark_u8_1d_to_binary", 2, futhark_u8_1d_to_binary_nif},
  {"futhark_entry_add", 3, futhark_entry_add_nif},
  {"futhark_entry_add_i64", 3, futhark_entry_add_i64_nif},
  {"futhark_context_sync", 1, futhark_context_sync_nif}
};

ERL_NIF_INIT(Elixir.Map.NIF, nif_funcs, &load, NULL, NULL, NULL)

## test.exs
c("lib_map.ex")
{:ok, cfg} = Map.NIF.futhark_context_config_new()
{:ok, ctx} = Map.NIF.futhark_context_new(cfg)

xs_binary = <<0, 1>>
{:ok, xs} = Map.NIF.futhark_new_u8_1d(ctx, xs_binary)
{:ok, ^xs_binary} = Map.NIF.futhark_u8_1d_to_binary(ctx, xs)
{:ok, ys} = Map.NIF.futhark_new_u8_1d(ctx, <<1, 4>>)
{:ok, zs} = Map.NIF.futhark_entry_add(ctx, xs, ys)
{:ok, <<1, 5>> = zs_binary} = Map.NIF.futhark_u8_1d_to_binary(ctx, zs)

xs_binary = <<1::integer-signed-64-little>>
{:ok, xs} = Map.NIF.futhark_new_i64_1d(ctx, xs_binary)
{:ok, ^xs_binary} = Map.NIF.futhark_i64_1d_to_binary(ctx, xs)
{:ok, ys} = Map.NIF.futhark_new_i64_1d(ctx, <<1279::integer-signed-64-little>>)
{:ok, zs} = Map.NIF.futhark_entry_add_i64(ctx, xs, ys)
{:ok, <<1280::integer-signed-64-little>> = zs_binary} = Map.NIF.futhark_i64_1d_to_binary(ctx, zs)
	defmodule Map.NIF do
	@on_load :load_nifs

	def load_nifs do
	:erlang.load_nif('./lib_map_nif', 0)
	end

	def futhark_context_config_new do
	raise "NIF futhark_context_config_new not implemented"
	end

	def futhark_context_new(_cfg) do
	raise "NIF futhark_context_new not implemented"
	end

	def futhark_context_sync(_ctx) do
	raise "NIF futhark_context_sync not implemented"
	end

	def futhark_new_i64_1d(_ctx, _binary) do
	raise "NIF futhark_new_i64_1d not implemented"
	end

	def futhark_i64_1d_to_binary(_ctx, _in) do
	raise "NIF futhark_i64_1d_to_binary not implemented"
	end

	def futhark_new_u8_1d(_ctx, _binary) do
	raise "NIF futhark_new_u8_1d not implemented"
	end

	def futhark_u8_1d_to_binary(_ctx, _in) do
	raise "NIF futhark_u8_1d_to_binary not implemented"
	end

	def futhark_entry_add(_ctx, _xs, _ys) do
	raise "NIF futhark_entry_add not implemented"
	end

	def futhark_entry_add_i64(_ctx, _xs, _ys) do
	raise "NIF futhark_entry_add_i64 not implemented"
	end

	end
	// Generated by Futhark 0.25.0 (prerelease - include info below when reporting bugs)
	// git: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]
	#pragma once


	// Headers
	#include <stdint.h>
	#include <stddef.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <float.h>

	#define CL_TARGET_OPENCL_VERSION 120
	#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
	#ifdef __APPLE__
	#define CL_SILENCE_DEPRECATION
	#include <OpenCL/cl.h>
	#else
	#include <CL/cl.h>
	#endif

	#ifdef __cplusplus
	extern "C" {
	#endif

	// Initialisation
	struct futhark_context_config;
	struct futhark_context_config *futhark_context_config_new(void);
	void futhark_context_config_free(struct futhark_context_config *cfg);
	int futhark_context_config_set_tuning_param(struct futhark_context_config cfg, const char param_name, size_t new_value);
	struct futhark_context;
	struct futhark_context futhark_context_new(struct futhark_context_config cfg);
	void futhark_context_free(struct futhark_context *cfg);
	void futhark_context_config_add_build_option(struct futhark_context_config cfg, const char opt);
	void futhark_context_config_set_device(struct futhark_context_config cfg, const char s);
	void futhark_context_config_set_platform(struct futhark_context_config cfg, const char s);
	void futhark_context_config_select_device_interactively(struct futhark_context_config *cfg);
	void futhark_context_config_list_devices(struct futhark_context_config *cfg);
	void futhark_context_config_dump_program_to(struct futhark_context_config cfg, const char s);
	void futhark_context_config_load_program_from(struct futhark_context_config cfg, const char s);
	void futhark_context_config_dump_binary_to(struct futhark_context_config cfg, const char s);
	void futhark_context_config_load_binary_from(struct futhark_context_config cfg, const char s);
	void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size);
	void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int size);
	void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size);
	void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size);
	void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size);
	void futhark_context_config_set_command_queue(struct futhark_context_config *cfg, cl_command_queue);
	void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag);
	void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag);
	void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag);
	int futhark_get_tuning_param_count(void);
	const char *futhark_get_tuning_param_name(int);
	const char *futhark_get_tuning_param_class(int);

	// Arrays
	struct futhark_i64_1d;
	struct futhark_i64_1d futhark_new_i64_1d(struct futhark_context ctx, const int64_t *data, int64_t dim0);
	struct futhark_i64_1d futhark_new_raw_i64_1d(struct futhark_context ctx, const cl_mem data, int64_t offset, int64_t dim0);
	int futhark_free_i64_1d(struct futhark_context ctx, struct futhark_i64_1d arr);
	int futhark_values_i64_1d(struct futhark_context ctx, struct futhark_i64_1d arr, int64_t *data);
	cl_mem futhark_values_raw_i64_1d(struct futhark_context ctx, struct futhark_i64_1d arr);
	const int64_t futhark_shape_i64_1d(struct futhark_context ctx, struct futhark_i64_1d *arr);
	struct futhark_u8_1d;
	struct futhark_u8_1d futhark_new_u8_1d(struct futhark_context ctx, const uint8_t *data, int64_t dim0);
	struct futhark_u8_1d futhark_new_raw_u8_1d(struct futhark_context ctx, const cl_mem data, int64_t offset, int64_t dim0);
	int futhark_free_u8_1d(struct futhark_context ctx, struct futhark_u8_1d arr);
	int futhark_values_u8_1d(struct futhark_context ctx, struct futhark_u8_1d arr, uint8_t *data);
	cl_mem futhark_values_raw_u8_1d(struct futhark_context ctx, struct futhark_u8_1d arr);
	const int64_t futhark_shape_u8_1d(struct futhark_context ctx, struct futhark_u8_1d *arr);

	// Opaque values



	// Entry points
	int futhark_entry_add(struct futhark_context ctx, struct futhark_u8_1d out0, const struct futhark_u8_1d in0, const struct futhark_u8_1d *in1);
	int futhark_entry_add_i64(struct futhark_context ctx, struct futhark_i64_1d out0, const struct futhark_i64_1d in0, const struct futhark_i64_1d *in1);

	// Miscellaneous
	int futhark_context_sync(struct futhark_context *ctx);
	cl_command_queue futhark_context_get_command_queue(struct futhark_context *ctx);
	void futhark_context_config_set_cache_file(struct futhark_context_config cfg, const char f);
	char futhark_context_report(struct futhark_context ctx);
	char futhark_context_get_error(struct futhark_context ctx);
	void futhark_context_set_logging_file(struct futhark_context ctx, FILE f);
	void futhark_context_pause_profiling(struct futhark_context *ctx);
	void futhark_context_unpause_profiling(struct futhark_context *ctx);
	int futhark_context_clear_caches(struct futhark_context *ctx);
	#define FUTHARK_BACKEND_opencl
	#define FUTHARK_SUCCESS 0
	#define FUTHARK_PROGRAM_ERROR 2
	#define FUTHARK_OUT_OF_MEMORY 3

	#ifdef __cplusplus
	}
	#endif
	{
	"backend": "opencl",
	"entry_points": {
	"add": {
	"cfun": "futhark_entry_add",
	"inputs": [
	{
	"name": "xs",
	"type": "[]u8",
	"unique": false
	},
	{
	"name": "ys",
	"type": "[]u8",
	"unique": false
	}
	],
	"outputs": [
	{
	"type": "[]u8",
	"unique": false
	}
	],
	"tuning_params": [
	"add.segmap_group_size_6879"
	]
	},
	"add_i64": {
	"cfun": "futhark_entry_add_i64",
	"inputs": [
	{
	"name": "xs",
	"type": "[]i64",
	"unique": false
	},
	{
	"name": "ys",
	"type": "[]i64",
	"unique": false
	}
	],
	"outputs": [
	{
	"type": "[]i64",
	"unique": false
	}
	],
	"tuning_params": [
	"add_i64.segmap_group_size_6899"
	]
	}
	},
	"types": {
	"[]i64": {
	"ctype": "struct futhark_i64_1d *",
	"elemtype": "i64",
	"kind": "array",
	"ops": {
	"free": "futhark_free_i64_1d",
	"new": "futhark_new_i64_1d",
	"shape": "futhark_shape_i64_1d",
	"values": "futhark_values_i64_1d"
	},
	"rank": 1
	},
	"[]u8": {
	"ctype": "struct futhark_u8_1d *",
	"elemtype": "u8",
	"kind": "array",
	"ops": {
	"free": "futhark_free_u8_1d",
	"new": "futhark_new_u8_1d",
	"shape": "futhark_shape_u8_1d",
	"values": "futhark_values_u8_1d"
	},
	"rank": 1
	}
	},
	"version": "0.25.0 (prerelease - include info below when reporting bugs)\ngit: 6a2e6e1 (Sun Apr 30 19:03:19 2023 +0200) [modified]"
	}
	#include <erl_nif.h>
	#include "lib_map.c"

	struct futhark_context;

	ERL_NIF_TERM atom_ok;

	ErlNifResourceType* CONFIG_TYPE;
	ErlNifResourceType* CONTEXT_TYPE;

	ErlNifResourceType* I64_1D;
	ErlNifResourceType* U8_1D;

	static int open_resource(ErlNifEnv* env, ErlNifResourceType** resource_type, const char* name)
	{
	const char* mod = "resources";
	int flags = ERL_NIF_RT_CREATE \| ERL_NIF_RT_TAKEOVER;

	*resource_type = enif_open_resource_type(env, mod, name, NULL, flags, NULL);
	if(CONFIG_TYPE == NULL) return -1;
	return 0;
	}

	static int load(ErlNifEnv* env, void** priv, ERL_NIF_TERM load_info)
	{
	if(open_resource(env, &CONFIG_TYPE, "Config") == -1) return -1;
	if(open_resource(env, &CONTEXT_TYPE, "Context") == -1) return -1;
	if(open_resource(env, &I64_1D, "i64_1d") == -1) return -1;
	if(open_resource(env, &U8_1D, "u8_1d") == -1) return -1;

	atom_ok = enif_make_atom(env, "ok");

	return 0;
	}

	static ERL_NIF_TERM futhark_context_config_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context_config **res;

	ERL_NIF_TERM ret;

	if(argc != 0) {
	return enif_make_badarg(env);
	}

	res = enif_alloc_resource(CONFIG_TYPE, sizeof(struct futhark_context_config *));
	if(res == NULL) return enif_make_badarg(env);

	struct futhark_context_config* tmp = futhark_context_config_new();

	*res = tmp;

	ret = enif_make_resource(env, res);
	enif_release_resource(res);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_context_new_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context_config **cfg;
	struct futhark_context **res;

	ERL_NIF_TERM ret;

	if(argc != 1) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONFIG_TYPE, (void**) &cfg)) {
	return enif_make_badarg(env);
	}

	res = enif_alloc_resource(CONTEXT_TYPE, sizeof(struct futhark_context *));
	if(res == NULL) return enif_make_badarg(env);

	struct futhark_context* tmp = futhark_context_new(*cfg);

	*res = tmp;

	ret = enif_make_resource(env, res);
	enif_release_resource(res);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_context_sync_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;

	if(argc != 1) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	futhark_context_sync(*ctx);

	return atom_ok;
	}

	static ERL_NIF_TERM futhark_new_i64_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;
	ErlNifBinary bin;

	struct futhark_i64_1d **res;
	ERL_NIF_TERM ret;

	if(argc != 2) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	if (!enif_inspect_binary(env, argv[1], &bin)) {
	return enif_make_badarg(env);
	}

	res = enif_alloc_resource(I64_1D, sizeof(struct futhark_i64_1d *));
	if(res == NULL) return enif_make_badarg(env);

	struct futhark_i64_1d * tmp = futhark_new_i64_1d(ctx, (const int64_t )bin.data, bin.size / sizeof(int64_t));
	const int64_t shape = futhark_shape_i64_1d(ctx, tmp);

	*res = tmp;

	ret = enif_make_resource(env, res);
	enif_release_resource(res);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_i64_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;
	struct futhark_i64_1d **xs;

	ErlNifBinary binary;
	ERL_NIF_TERM ret;

	if(argc != 2) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
	return enif_make_badarg(env);
	}

	const int64_t shape = futhark_shape_i64_1d(ctx, *xs);

	enif_alloc_binary(shape[0] * sizeof(int64_t), &binary);

	if (futhark_values_i64_1d(ctx, xs, (int64_t *)(binary.data)) != 0) return enif_make_badarg(env);
	futhark_context_sync(*ctx);

	ret = enif_make_binary(env, &binary);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_new_u8_1d_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;
	ErlNifBinary bin;

	struct futhark_u8_1d **res;
	ERL_NIF_TERM ret;

	if(argc != 2) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	if (!enif_inspect_binary(env, argv[1], &bin)) {
	return enif_make_badarg(env);
	}

	res = enif_alloc_resource(U8_1D, sizeof(struct futhark_u8_1d *));
	if(res == NULL) return enif_make_badarg(env);

	struct futhark_u8_1d * tmp = futhark_new_u8_1d(ctx, (const uint8_t )bin.data, bin.size / sizeof(uint8_t));
	const int64_t shape = futhark_shape_u8_1d(ctx, tmp);

	*res = tmp;

	ret = enif_make_resource(env, res);
	enif_release_resource(res);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_u8_1d_to_binary_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;
	struct futhark_u8_1d **xs;

	ErlNifBinary binary;
	ERL_NIF_TERM ret;

	if(argc != 2) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
	return enif_make_badarg(env);
	}

	const int64_t shape = futhark_shape_u8_1d(ctx, *xs);

	enif_alloc_binary(shape[0] * sizeof(uint8_t), &binary);

	if (futhark_values_u8_1d(ctx, xs, (uint8_t *)(binary.data)) != 0) return enif_make_badarg(env);
	futhark_context_sync(*ctx);

	ret = enif_make_binary(env, &binary);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_entry_add_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;

	struct futhark_u8_1d **xs;
	struct futhark_u8_1d **ys;

	struct futhark_u8_1d **res;

	ERL_NIF_TERM ret;

	if(argc != 3) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[1], U8_1D, (void**) &xs)) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[2], U8_1D, (void**) &ys)) {
	return enif_make_badarg(env);
	}

	res = enif_alloc_resource(U8_1D, sizeof(struct futhark_u8_1d *));
	if(res == NULL) return enif_make_badarg(env);

	if (futhark_entry_add(ctx, res, xs, *ys) != 0) return enif_make_badarg(env);

	ret = enif_make_resource(env, res);
	enif_release_resource(res);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ERL_NIF_TERM futhark_entry_add_i64_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
	{
	struct futhark_context **ctx;

	struct futhark_i64_1d **xs;
	struct futhark_i64_1d **ys;

	struct futhark_i64_1d **res;

	ERL_NIF_TERM ret;

	if(argc != 3) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[0], CONTEXT_TYPE, (void**) &ctx)) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[1], I64_1D, (void**) &xs)) {
	return enif_make_badarg(env);
	}

	if(!enif_get_resource(env, argv[2], I64_1D, (void**) &ys)) {
	return enif_make_badarg(env);
	}

	res = enif_alloc_resource(I64_1D, sizeof(struct futhark_i64_1d *));
	if(res == NULL) return enif_make_badarg(env);

	if (futhark_entry_add_i64(ctx, res, xs, *ys) != 0) return enif_make_badarg(env);

	ret = enif_make_resource(env, res);
	enif_release_resource(res);

	return enif_make_tuple2(env, atom_ok, ret);
	}

	static ErlNifFunc nif_funcs[] = {
	{"futhark_context_config_new", 0, futhark_context_config_new_nif},
	{"futhark_context_new", 1, futhark_context_new_nif},

	{"futhark_new_i64_1d", 2, futhark_new_i64_1d_nif},
	{"futhark_i64_1d_to_binary", 2, futhark_i64_1d_to_binary_nif},
	{"futhark_new_u8_1d", 2, futhark_new_u8_1d_nif},
	{"futhark_u8_1d_to_binary", 2, futhark_u8_1d_to_binary_nif},
	{"futhark_entry_add", 3, futhark_entry_add_nif},
	{"futhark_entry_add_i64", 3, futhark_entry_add_i64_nif},
	{"futhark_context_sync", 1, futhark_context_sync_nif}
	};

	ERL_NIF_INIT(Elixir.Map.NIF, nif_funcs, &load, NULL, NULL, NULL)
	c("lib_map.ex")
	{:ok, cfg} = Map.NIF.futhark_context_config_new()
	{:ok, ctx} = Map.NIF.futhark_context_new(cfg)

	xs_binary = <<0, 1>>
	{:ok, xs} = Map.NIF.futhark_new_u8_1d(ctx, xs_binary)
	{:ok, ^xs_binary} = Map.NIF.futhark_u8_1d_to_binary(ctx, xs)
	{:ok, ys} = Map.NIF.futhark_new_u8_1d(ctx, <<1, 4>>)
	{:ok, zs} = Map.NIF.futhark_entry_add(ctx, xs, ys)
	{:ok, <<1, 5>> = zs_binary} = Map.NIF.futhark_u8_1d_to_binary(ctx, zs)

	xs_binary = <<1::integer-signed-64-little>>
	{:ok, xs} = Map.NIF.futhark_new_i64_1d(ctx, xs_binary)
	{:ok, ^xs_binary} = Map.NIF.futhark_i64_1d_to_binary(ctx, xs)
	{:ok, ys} = Map.NIF.futhark_new_i64_1d(ctx, <<1279::integer-signed-64-little>>)
	{:ok, zs} = Map.NIF.futhark_entry_add_i64(ctx, xs, ys)
	{:ok, <<1280::integer-signed-64-little>> = zs_binary} = Map.NIF.futhark_i64_1d_to_binary(ctx, zs)