Skip to content

Instantly share code, notes, and snippets.

@athas
Created June 3, 2019 11:27
Show Gist options
  • Save athas/6e2891f0eaa7fd77107f6d9d2d6bee07 to your computer and use it in GitHub Desktop.
Save athas/6e2891f0eaa7fd77107f6d9d2d6bee07 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
/*
 * OpenCL extension requests used by the rest of this program.
 * The Clang storage-class extension is only requested when the compiler
 * advertises it (presumably so the `static inline` helpers are accepted --
 * TODO confirm).
 */
#ifdef cl_clang_storage_class_specifiers
#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif
/* Byte-granularity stores through pointers into memory objects. */
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
/* Double-precision floating point, used by the f64 helpers and kernels. */
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/*
 * Kernel with no observable effect: each work-item fetches its global id,
 * compares it against n and returns. `dummy` is never dereferenced.
 */
__kernel void dummy_kernel(__global unsigned char *dummy, int n)
{
    const int gid = get_global_id(0);

    if (n <= gid) {
        return;
    }
}
/*
 * <stdint.h>-style names for the OpenCL C scalar integer types, so the
 * helpers and kernels below can use fixed-width spellings.
 * (OpenCL C specifies char/short/int/long as 8/16/32/64 bits.)
 */
typedef char int8_t;
typedef short int16_t;
typedef int int32_t;
typedef long int64_t;
typedef uchar uint8_t;
typedef ushort uint16_t;
typedef uint uint32_t;
typedef ulong uint64_t;
/*
 * Declares a __local byte buffer `m` of `size` bytes with the target's
 * default maximum alignment.  Kernels reinterpret these buffers as wider
 * element types (e.g. __local double *), so the alignment request must be
 * honoured: the attribute is spelled `aligned`; an unrecognised spelling
 * such as `align` is ignored by the compiler.
 */
#define ALIGNED_LOCAL_MEMORY(m,size) __local unsigned char m[size] __attribute__ ((aligned))
/*
 * Memory fence covering global memory.
 * When cl_nv_pragma_unroll is defined the fence is issued through the inline
 * assembly instruction `membar.gl`; otherwise the OpenCL mem_fence builtin is
 * called with both the local and global fence flags.
 */
static inline void mem_fence_global()
{
#ifdef cl_nv_pragma_unroll
    asm("membar.gl;");
#else
    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#endif
}

/* Memory fence covering local memory only. */
static inline void mem_fence_local()
{
    mem_fence(CLK_LOCAL_MEM_FENCE);
}
/* Integer addition, one helper per width; the result has the operand type. */
static inline int8_t add8(int8_t x, int8_t y)
{
    const int8_t sum = x + y;
    return sum;
}

static inline int16_t add16(int16_t x, int16_t y)
{
    const int16_t sum = x + y;
    return sum;
}

static inline int32_t add32(int32_t x, int32_t y)
{
    const int32_t sum = x + y;
    return sum;
}

static inline int64_t add64(int64_t x, int64_t y)
{
    const int64_t sum = x + y;
    return sum;
}

/* Integer subtraction (x - y), one helper per width. */
static inline int8_t sub8(int8_t x, int8_t y)
{
    const int8_t diff = x - y;
    return diff;
}

static inline int16_t sub16(int16_t x, int16_t y)
{
    const int16_t diff = x - y;
    return diff;
}

static inline int32_t sub32(int32_t x, int32_t y)
{
    const int32_t diff = x - y;
    return diff;
}

static inline int64_t sub64(int64_t x, int64_t y)
{
    const int64_t diff = x - y;
    return diff;
}

/* Integer multiplication, one helper per width. */
static inline int8_t mul8(int8_t x, int8_t y)
{
    const int8_t prod = x * y;
    return prod;
}

static inline int16_t mul16(int16_t x, int16_t y)
{
    const int16_t prod = x * y;
    return prod;
}

static inline int32_t mul32(int32_t x, int32_t y)
{
    const int32_t prod = x * y;
    return prod;
}

static inline int64_t mul64(int64_t x, int64_t y)
{
    const int64_t prod = x * y;
    return prod;
}
/* Unsigned division x / y, one helper per width. y must be non-zero. */
static inline uint8_t udiv8(uint8_t x, uint8_t y)
{
    const uint8_t quot = x / y;
    return quot;
}

static inline uint16_t udiv16(uint16_t x, uint16_t y)
{
    const uint16_t quot = x / y;
    return quot;
}

static inline uint32_t udiv32(uint32_t x, uint32_t y)
{
    const uint32_t quot = x / y;
    return quot;
}

static inline uint64_t udiv64(uint64_t x, uint64_t y)
{
    const uint64_t quot = x / y;
    return quot;
}

/* Unsigned remainder x % y, one helper per width. y must be non-zero. */
static inline uint8_t umod8(uint8_t x, uint8_t y)
{
    const uint8_t left = x % y;
    return left;
}

static inline uint16_t umod16(uint16_t x, uint16_t y)
{
    const uint16_t left = x % y;
    return left;
}

static inline uint32_t umod32(uint32_t x, uint32_t y)
{
    const uint32_t left = x % y;
    return left;
}

static inline uint64_t umod64(uint64_t x, uint64_t y)
{
    const uint64_t left = x % y;
    return left;
}
/*
 * Signed division rounding toward negative infinity.
 * The truncating quotient is lowered by one whenever the remainder is
 * non-zero and its sign differs from the divisor's sign.
 */
static inline int8_t sdiv8(int8_t x, int8_t y)
{
    const int8_t quot = x / y;
    const int8_t left = x % y;
    const int adjust = (left != 0 && ((left < 0) != (y < 0))) ? 1 : 0;

    return quot - adjust;
}

static inline int16_t sdiv16(int16_t x, int16_t y)
{
    const int16_t quot = x / y;
    const int16_t left = x % y;
    const int adjust = (left != 0 && ((left < 0) != (y < 0))) ? 1 : 0;

    return quot - adjust;
}

static inline int32_t sdiv32(int32_t x, int32_t y)
{
    const int32_t quot = x / y;
    const int32_t left = x % y;
    const int adjust = (left != 0 && ((left < 0) != (y < 0))) ? 1 : 0;

    return quot - adjust;
}

static inline int64_t sdiv64(int64_t x, int64_t y)
{
    const int64_t quot = x / y;
    const int64_t left = x % y;
    const int adjust = (left != 0 && ((left < 0) != (y < 0))) ? 1 : 0;

    return quot - adjust;
}
/*
 * Signed modulus matching floor division: the C remainder is returned as-is
 * when it is zero or when x and y are both positive or both negative;
 * otherwise y is added to it.
 */
static inline int8_t smod8(int8_t x, int8_t y)
{
    const int8_t left = x % y;

    if (left == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
        return left;
    }
    return left + y;
}

static inline int16_t smod16(int16_t x, int16_t y)
{
    const int16_t left = x % y;

    if (left == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
        return left;
    }
    return left + y;
}

static inline int32_t smod32(int32_t x, int32_t y)
{
    const int32_t left = x % y;

    if (left == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
        return left;
    }
    return left + y;
}

static inline int64_t smod64(int64_t x, int64_t y)
{
    const int64_t left = x % y;

    if (left == 0 || (x > 0 && y > 0) || (x < 0 && y < 0)) {
        return left;
    }
    return left + y;
}
/* Signed division using the language's native `/` operator. */
static inline int8_t squot8(int8_t x, int8_t y)
{
    const int8_t quot = x / y;
    return quot;
}

static inline int16_t squot16(int16_t x, int16_t y)
{
    const int16_t quot = x / y;
    return quot;
}

static inline int32_t squot32(int32_t x, int32_t y)
{
    const int32_t quot = x / y;
    return quot;
}

static inline int64_t squot64(int64_t x, int64_t y)
{
    const int64_t quot = x / y;
    return quot;
}

/* Signed remainder using the language's native `%` operator. */
static inline int8_t srem8(int8_t x, int8_t y)
{
    const int8_t left = x % y;
    return left;
}

static inline int16_t srem16(int16_t x, int16_t y)
{
    const int16_t left = x % y;
    return left;
}

static inline int32_t srem32(int32_t x, int32_t y)
{
    const int32_t left = x % y;
    return left;
}

static inline int64_t srem64(int64_t x, int64_t y)
{
    const int64_t left = x % y;
    return left;
}
/* Signed minimum: x when x < y, otherwise y. */
static inline int8_t smin8(int8_t x, int8_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

static inline int16_t smin16(int16_t x, int16_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

static inline int32_t smin32(int32_t x, int32_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

static inline int64_t smin64(int64_t x, int64_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

/* Unsigned minimum: x when x < y, otherwise y. */
static inline uint8_t umin8(uint8_t x, uint8_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

static inline uint16_t umin16(uint16_t x, uint16_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

static inline uint32_t umin32(uint32_t x, uint32_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}

static inline uint64_t umin64(uint64_t x, uint64_t y)
{
    if (x < y) {
        return x;
    }
    return y;
}
/* Signed maximum: y when x < y, otherwise x. */
static inline int8_t smax8(int8_t x, int8_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

static inline int16_t smax16(int16_t x, int16_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

static inline int32_t smax32(int32_t x, int32_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

static inline int64_t smax64(int64_t x, int64_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

/* Unsigned maximum: y when x < y, otherwise x. */
static inline uint8_t umax8(uint8_t x, uint8_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

static inline uint16_t umax16(uint16_t x, uint16_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

static inline uint32_t umax32(uint32_t x, uint32_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}

static inline uint64_t umax64(uint64_t x, uint64_t y)
{
    if (x < y) {
        return y;
    }
    return x;
}
/* Left shift of an unsigned value by y bits, narrowed to the operand width. */
static inline uint8_t shl8(uint8_t x, uint8_t y)
{
    const uint8_t shifted = x << y;
    return shifted;
}

static inline uint16_t shl16(uint16_t x, uint16_t y)
{
    const uint16_t shifted = x << y;
    return shifted;
}

static inline uint32_t shl32(uint32_t x, uint32_t y)
{
    const uint32_t shifted = x << y;
    return shifted;
}

static inline uint64_t shl64(uint64_t x, uint64_t y)
{
    const uint64_t shifted = x << y;
    return shifted;
}

/* Right shift on unsigned operands (vacated bits are zero). */
static inline uint8_t lshr8(uint8_t x, uint8_t y)
{
    const uint8_t shifted = x >> y;
    return shifted;
}

static inline uint16_t lshr16(uint16_t x, uint16_t y)
{
    const uint16_t shifted = x >> y;
    return shifted;
}

static inline uint32_t lshr32(uint32_t x, uint32_t y)
{
    const uint32_t shifted = x >> y;
    return shifted;
}

static inline uint64_t lshr64(uint64_t x, uint64_t y)
{
    const uint64_t shifted = x >> y;
    return shifted;
}

/* Right shift on signed operands, using the native `>>` of the signed type. */
static inline int8_t ashr8(int8_t x, int8_t y)
{
    const int8_t shifted = x >> y;
    return shifted;
}

static inline int16_t ashr16(int16_t x, int16_t y)
{
    const int16_t shifted = x >> y;
    return shifted;
}

static inline int32_t ashr32(int32_t x, int32_t y)
{
    const int32_t shifted = x >> y;
    return shifted;
}

static inline int64_t ashr64(int64_t x, int64_t y)
{
    const int64_t shifted = x >> y;
    return shifted;
}
/* Bitwise AND, one helper per width. */
static inline uint8_t and8(uint8_t x, uint8_t y)
{
    const uint8_t bits = x & y;
    return bits;
}

static inline uint16_t and16(uint16_t x, uint16_t y)
{
    const uint16_t bits = x & y;
    return bits;
}

static inline uint32_t and32(uint32_t x, uint32_t y)
{
    const uint32_t bits = x & y;
    return bits;
}

static inline uint64_t and64(uint64_t x, uint64_t y)
{
    const uint64_t bits = x & y;
    return bits;
}

/* Bitwise OR, one helper per width. */
static inline uint8_t or8(uint8_t x, uint8_t y)
{
    const uint8_t bits = x | y;
    return bits;
}

static inline uint16_t or16(uint16_t x, uint16_t y)
{
    const uint16_t bits = x | y;
    return bits;
}

static inline uint32_t or32(uint32_t x, uint32_t y)
{
    const uint32_t bits = x | y;
    return bits;
}

static inline uint64_t or64(uint64_t x, uint64_t y)
{
    const uint64_t bits = x | y;
    return bits;
}

/* Bitwise exclusive OR, one helper per width. */
static inline uint8_t xor8(uint8_t x, uint8_t y)
{
    const uint8_t bits = x ^ y;
    return bits;
}

static inline uint16_t xor16(uint16_t x, uint16_t y)
{
    const uint16_t bits = x ^ y;
    return bits;
}

static inline uint32_t xor32(uint32_t x, uint32_t y)
{
    const uint32_t bits = x ^ y;
    return bits;
}

static inline uint64_t xor64(uint64_t x, uint64_t y)
{
    const uint64_t bits = x ^ y;
    return bits;
}
/* Unsigned "less than": yields the value of (x < y) as a char. */
static inline char ult8(uint8_t x, uint8_t y)
{
    return (char) (x < y);
}

static inline char ult16(uint16_t x, uint16_t y)
{
    return (char) (x < y);
}

static inline char ult32(uint32_t x, uint32_t y)
{
    return (char) (x < y);
}

static inline char ult64(uint64_t x, uint64_t y)
{
    return (char) (x < y);
}

/* Unsigned "less than or equal": yields the value of (x <= y) as a char. */
static inline char ule8(uint8_t x, uint8_t y)
{
    return (char) (x <= y);
}

static inline char ule16(uint16_t x, uint16_t y)
{
    return (char) (x <= y);
}

static inline char ule32(uint32_t x, uint32_t y)
{
    return (char) (x <= y);
}

static inline char ule64(uint64_t x, uint64_t y)
{
    return (char) (x <= y);
}
/* Signed "less than": yields the value of (x < y) as a char. */
static inline char slt8(int8_t x, int8_t y)
{
    return (char) (x < y);
}

static inline char slt16(int16_t x, int16_t y)
{
    return (char) (x < y);
}

static inline char slt32(int32_t x, int32_t y)
{
    return (char) (x < y);
}

static inline char slt64(int64_t x, int64_t y)
{
    return (char) (x < y);
}

/* Signed "less than or equal": yields the value of (x <= y) as a char. */
static inline char sle8(int8_t x, int8_t y)
{
    return (char) (x <= y);
}

static inline char sle16(int16_t x, int16_t y)
{
    return (char) (x <= y);
}

static inline char sle32(int32_t x, int32_t y)
{
    return (char) (x <= y);
}

static inline char sle64(int64_t x, int64_t y)
{
    return (char) (x <= y);
}
/*
 * Integer power x**y by repeated squaring, one helper per width.
 *
 * Non-negative exponents: the bits of y are consumed from least to most
 * significant; whenever the current bit is set, the running square of x is
 * multiplied into the result.
 *
 * Negative exponents: the square-and-multiply loop would never finish,
 * because an arithmetic right shift of a negative value never reaches zero.
 * They are handled up front and yield the truncated value of 1 / x**|y|:
 *   x ==  1  ->  1
 *   x == -1  ->  -1 for odd y, 1 for even y
 *   otherwise 0 (this includes x == 0, where the quotient is undefined).
 */
static inline int8_t pow8(int8_t x, int8_t y)
{
    if (y < 0) {
        if (x == 1)
            return 1;
        if (x == -1)
            return (y & 1) ? -1 : 1;
        return 0;
    }

    int8_t res = 1, rem = y;

    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}

static inline int16_t pow16(int16_t x, int16_t y)
{
    if (y < 0) {
        if (x == 1)
            return 1;
        if (x == -1)
            return (y & 1) ? -1 : 1;
        return 0;
    }

    int16_t res = 1, rem = y;

    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}

static inline int32_t pow32(int32_t x, int32_t y)
{
    if (y < 0) {
        if (x == 1)
            return 1;
        if (x == -1)
            return (y & 1) ? -1 : 1;
        return 0;
    }

    int32_t res = 1, rem = y;

    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}

static inline int64_t pow64(int64_t x, int64_t y)
{
    if (y < 0) {
        if (x == 1)
            return 1;
        if (x == -1)
            return (y & 1) ? -1 : 1;
        return 0;
    }

    int64_t res = 1, rem = y;

    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}
/* Integer to bool: true exactly when x is non-zero. */
static inline bool itob_i8_bool(int8_t x)
{
    return x != 0;
}

static inline bool itob_i16_bool(int16_t x)
{
    return x != 0;
}

static inline bool itob_i32_bool(int32_t x)
{
    return x != 0;
}

static inline bool itob_i64_bool(int64_t x)
{
    return x != 0;
}

/* Bool to integer: 1 for true, 0 for false. */
static inline int8_t btoi_bool_i8(bool x)
{
    return x ? 1 : 0;
}

static inline int16_t btoi_bool_i16(bool x)
{
    return x ? 1 : 0;
}

static inline int32_t btoi_bool_i32(bool x)
{
    return x ? 1 : 0;
}

static inline int64_t btoi_bool_i64(bool x)
{
    return x ? 1 : 0;
}
/*
 * Sign-extending (or truncating) integer conversions sext_<from>_<to>:
 * the argument is first converted to the signed source width, then to the
 * signed destination width.
 * The argument is parenthesised so that an expression argument such as
 * `a + b` or `a << n` is converted as a whole rather than only its first
 * operand.
 */
#define sext_i8_i8(x) ((int8_t) (int8_t) (x))
#define sext_i8_i16(x) ((int16_t) (int8_t) (x))
#define sext_i8_i32(x) ((int32_t) (int8_t) (x))
#define sext_i8_i64(x) ((int64_t) (int8_t) (x))
#define sext_i16_i8(x) ((int8_t) (int16_t) (x))
#define sext_i16_i16(x) ((int16_t) (int16_t) (x))
#define sext_i16_i32(x) ((int32_t) (int16_t) (x))
#define sext_i16_i64(x) ((int64_t) (int16_t) (x))
#define sext_i32_i8(x) ((int8_t) (int32_t) (x))
#define sext_i32_i16(x) ((int16_t) (int32_t) (x))
#define sext_i32_i32(x) ((int32_t) (int32_t) (x))
#define sext_i32_i64(x) ((int64_t) (int32_t) (x))
#define sext_i64_i8(x) ((int8_t) (int64_t) (x))
#define sext_i64_i16(x) ((int16_t) (int64_t) (x))
#define sext_i64_i32(x) ((int32_t) (int64_t) (x))
#define sext_i64_i64(x) ((int64_t) (int64_t) (x))
/*
 * Zero-extending (or truncating) integer conversions zext_<from>_<to>:
 * the argument is first converted to the unsigned source width, then to the
 * unsigned destination width.
 * The argument is parenthesised so that an expression argument such as
 * `a + b` or `a << n` is converted as a whole rather than only its first
 * operand.
 */
#define zext_i8_i8(x) ((uint8_t) (uint8_t) (x))
#define zext_i8_i16(x) ((uint16_t) (uint8_t) (x))
#define zext_i8_i32(x) ((uint32_t) (uint8_t) (x))
#define zext_i8_i64(x) ((uint64_t) (uint8_t) (x))
#define zext_i16_i8(x) ((uint8_t) (uint16_t) (x))
#define zext_i16_i16(x) ((uint16_t) (uint16_t) (x))
#define zext_i16_i32(x) ((uint32_t) (uint16_t) (x))
#define zext_i16_i64(x) ((uint64_t) (uint16_t) (x))
#define zext_i32_i8(x) ((uint8_t) (uint32_t) (x))
#define zext_i32_i16(x) ((uint16_t) (uint32_t) (x))
#define zext_i32_i32(x) ((uint32_t) (uint32_t) (x))
#define zext_i32_i64(x) ((uint64_t) (uint32_t) (x))
#define zext_i64_i8(x) ((uint8_t) (uint64_t) (x))
#define zext_i64_i16(x) ((uint16_t) (uint64_t) (x))
#define zext_i64_i32(x) ((uint32_t) (uint64_t) (x))
#define zext_i64_i64(x) ((uint64_t) (uint64_t) (x))
/* Single-precision arithmetic helpers. */
static inline float fdiv32(float x, float y)
{
    const float quot = x / y;
    return quot;
}

static inline float fadd32(float x, float y)
{
    const float sum = x + y;
    return sum;
}

static inline float fsub32(float x, float y)
{
    const float diff = x - y;
    return diff;
}

static inline float fmul32(float x, float y)
{
    const float prod = x * y;
    return prod;
}

/* Returns x when x < y, otherwise y (so y is returned if the compare fails). */
static inline float fmin32(float x, float y)
{
    if (x < y) {
        return x;
    }
    return y;
}

/* Returns y when x < y, otherwise x (so x is returned if the compare fails). */
static inline float fmax32(float x, float y)
{
    if (x < y) {
        return y;
    }
    return x;
}

/* x raised to the power y via the pow builtin. */
static inline float fpow32(float x, float y)
{
    const float raised = pow(x, y);
    return raised;
}

/* Comparison helpers: the value of the comparison returned as a char. */
static inline char cmplt32(float x, float y)
{
    return (char) (x < y);
}

static inline char cmple32(float x, float y)
{
    return (char) (x <= y);
}
/* Signed integer to single-precision float. */
static inline float sitofp_i8_f32(int8_t x)
{
    return (float) x;
}

static inline float sitofp_i16_f32(int16_t x)
{
    return (float) x;
}

static inline float sitofp_i32_f32(int32_t x)
{
    return (float) x;
}

static inline float sitofp_i64_f32(int64_t x)
{
    return (float) x;
}

/* Unsigned integer to single-precision float. */
static inline float uitofp_i8_f32(uint8_t x)
{
    return (float) x;
}

static inline float uitofp_i16_f32(uint16_t x)
{
    return (float) x;
}

static inline float uitofp_i32_f32(uint32_t x)
{
    return (float) x;
}

static inline float uitofp_i64_f32(uint64_t x)
{
    return (float) x;
}
/* Single-precision float to signed integer (conversion drops the fraction). */
static inline int8_t fptosi_f32_i8(float x)
{
    return (int8_t) x;
}

static inline int16_t fptosi_f32_i16(float x)
{
    return (int16_t) x;
}

static inline int32_t fptosi_f32_i32(float x)
{
    return (int32_t) x;
}

static inline int64_t fptosi_f32_i64(float x)
{
    return (int64_t) x;
}

/* Single-precision float to unsigned integer (conversion drops the fraction). */
static inline uint8_t fptoui_f32_i8(float x)
{
    return (uint8_t) x;
}

static inline uint16_t fptoui_f32_i16(float x)
{
    return (uint16_t) x;
}

static inline uint32_t fptoui_f32_i32(float x)
{
    return (uint32_t) x;
}

static inline uint64_t fptoui_f32_i64(float x)
{
    return (uint64_t) x;
}
static inline float futrts_log32(float x)
{
return log(x);
}
static inline float futrts_log2_32(float x)
{
return log2(x);
}
static inline float futrts_log10_32(float x)
{
return log10(x);
}
static inline float futrts_sqrt32(float x)
{
return sqrt(x);
}
static inline float futrts_exp32(float x)
{
return exp(x);
}
static inline float futrts_cos32(float x)
{
return cos(x);
}
static inline float futrts_sin32(float x)
{
return sin(x);
}
static inline float futrts_tan32(float x)
{
return tan(x);
}
static inline float futrts_acos32(float x)
{
return acos(x);
}
static inline float futrts_asin32(float x)
{
return asin(x);
}
static inline float futrts_atan32(float x)
{
return atan(x);
}
static inline float futrts_atan2_32(float x, float y)
{
return atan2(x, y);
}
static inline float futrts_gamma32(float x)
{
return tgamma(x);
}
static inline float futrts_lgamma32(float x)
{
return lgamma(x);
}
static inline float futrts_round32(float x)
{
return rint(x);
}
/* Result of isnan(x), narrowed to char. */
static inline char futrts_isnan32(float x)
{
    const int flag = isnan(x);
    return (char) flag;
}

/* Result of isinf(x), narrowed to char. */
static inline char futrts_isinf32(float x)
{
    const int flag = isinf(x);
    return (char) flag;
}

/* Reinterprets the bits of a float as a 32-bit integer (via a union). */
static inline int32_t futrts_to_bits32(float x)
{
    union {
        float as_float;
        int32_t as_bits;
    } pun;

    pun.as_float = x;
    return pun.as_bits;
}

/* Reinterprets a 32-bit integer bit pattern as a float (via a union). */
static inline float futrts_from_bits32(int32_t x)
{
    union {
        int32_t as_bits;
        float as_float;
    } pun;

    pun.as_bits = x;
    return pun.as_float;
}
/* Double-precision arithmetic helpers. */
static inline double fdiv64(double x, double y)
{
    const double quot = x / y;
    return quot;
}

static inline double fadd64(double x, double y)
{
    const double sum = x + y;
    return sum;
}

static inline double fsub64(double x, double y)
{
    const double diff = x - y;
    return diff;
}

static inline double fmul64(double x, double y)
{
    const double prod = x * y;
    return prod;
}

/* Returns x when x < y, otherwise y (so y is returned if the compare fails). */
static inline double fmin64(double x, double y)
{
    if (x < y) {
        return x;
    }
    return y;
}

/* Returns y when x < y, otherwise x (so x is returned if the compare fails). */
static inline double fmax64(double x, double y)
{
    if (x < y) {
        return y;
    }
    return x;
}

/* x raised to the power y via the pow builtin. */
static inline double fpow64(double x, double y)
{
    const double raised = pow(x, y);
    return raised;
}

/* Comparison helpers: the value of the comparison returned as a char. */
static inline char cmplt64(double x, double y)
{
    return (char) (x < y);
}

static inline char cmple64(double x, double y)
{
    return (char) (x <= y);
}
/* Signed integer to double-precision float. */
static inline double sitofp_i8_f64(int8_t x)
{
    return (double) x;
}

static inline double sitofp_i16_f64(int16_t x)
{
    return (double) x;
}

static inline double sitofp_i32_f64(int32_t x)
{
    return (double) x;
}

static inline double sitofp_i64_f64(int64_t x)
{
    return (double) x;
}

/* Unsigned integer to double-precision float. */
static inline double uitofp_i8_f64(uint8_t x)
{
    return (double) x;
}

static inline double uitofp_i16_f64(uint16_t x)
{
    return (double) x;
}

static inline double uitofp_i32_f64(uint32_t x)
{
    return (double) x;
}

static inline double uitofp_i64_f64(uint64_t x)
{
    return (double) x;
}
/* Double-precision float to signed integer (conversion drops the fraction). */
static inline int8_t fptosi_f64_i8(double x)
{
    return (int8_t) x;
}

static inline int16_t fptosi_f64_i16(double x)
{
    return (int16_t) x;
}

static inline int32_t fptosi_f64_i32(double x)
{
    return (int32_t) x;
}

static inline int64_t fptosi_f64_i64(double x)
{
    return (int64_t) x;
}

/* Double-precision float to unsigned integer (conversion drops the fraction). */
static inline uint8_t fptoui_f64_i8(double x)
{
    return (uint8_t) x;
}

static inline uint16_t fptoui_f64_i16(double x)
{
    return (uint16_t) x;
}

static inline uint32_t fptoui_f64_i32(double x)
{
    return (uint32_t) x;
}

static inline uint64_t fptoui_f64_i64(double x)
{
    return (uint64_t) x;
}
static inline double futrts_log64(double x)
{
return log(x);
}
static inline double futrts_log2_64(double x)
{
return log2(x);
}
static inline double futrts_log10_64(double x)
{
return log10(x);
}
static inline double futrts_sqrt64(double x)
{
return sqrt(x);
}
static inline double futrts_exp64(double x)
{
return exp(x);
}
static inline double futrts_cos64(double x)
{
return cos(x);
}
static inline double futrts_sin64(double x)
{
return sin(x);
}
static inline double futrts_tan64(double x)
{
return tan(x);
}
static inline double futrts_acos64(double x)
{
return acos(x);
}
static inline double futrts_asin64(double x)
{
return asin(x);
}
static inline double futrts_atan64(double x)
{
return atan(x);
}
static inline double futrts_atan2_64(double x, double y)
{
return atan2(x, y);
}
static inline double futrts_gamma64(double x)
{
return tgamma(x);
}
static inline double futrts_lgamma64(double x)
{
return lgamma(x);
}
static inline double futrts_round64(double x)
{
return rint(x);
}
/* Result of isnan(x), narrowed to char. */
static inline char futrts_isnan64(double x)
{
    const int flag = isnan(x);
    return (char) flag;
}

/* Result of isinf(x), narrowed to char. */
static inline char futrts_isinf64(double x)
{
    const int flag = isinf(x);
    return (char) flag;
}

/* Reinterprets the bits of a double as a 64-bit integer (via a union). */
static inline int64_t futrts_to_bits64(double x)
{
    union {
        double as_double;
        int64_t as_bits;
    } pun;

    pun.as_double = x;
    return pun.as_bits;
}

/* Reinterprets a 64-bit integer bit pattern as a double (via a union). */
static inline double futrts_from_bits64(int64_t x)
{
    union {
        int64_t as_bits;
        double as_double;
    } pun;

    pun.as_bits = x;
    return pun.as_double;
}
/* Floating-point conversions fpconv_<from>_<to> between float and double. */
static inline float fpconv_f32_f32(float x)
{
    return (float) x;
}

static inline double fpconv_f32_f64(float x)
{
    return (double) x;
}

static inline float fpconv_f64_f32(double x)
{
    return (float) x;
}

static inline double fpconv_f64_f64(double x)
{
    return (double) x;
}
/*
 * Element-wise copy of 8-byte (double) values from mem_81429 to mem_81433
 * with the two index dimensions swapped.
 *
 * For every global id g below K_68510 * D_68526, with
 *   row = g / D_68526   and   col = g - row * D_68526,
 * the value at source element (col * K_68510 + row) is written to
 * destination element (row * D_68526 + col).
 */
__kernel void copy_83061(int32_t K_68510, int32_t D_68526, __global
                         unsigned char *mem_81429, __global
                         unsigned char *mem_81433)
{
    const int32_t gtid = get_global_id(0);

    /* Work-items beyond the K*D element count have nothing to copy. */
    if (!slt32(gtid, K_68510 * D_68526)) {
        return;
    }

    const int32_t row = squot32(gtid, D_68526);
    const int32_t col = gtid - row * D_68526;

    const int32_t dst_elem = row * D_68526 + col;
    const int32_t src_elem = 0 * K_68510 + (col * K_68510 + row);

    /* Offsets are in bytes; each element is 8 bytes wide. */
    *(__global double *) &mem_81433[dst_elem * 8] =
        *(__global double *) &mem_81429[src_elem * 8];
}
/*
 * For every global thread id gtid below N_68316 this kernel computes
 *
 *   log( sum over k in [0, K_68318) of
 *        exp( alphas[k] + sum_d qs[k][d] - 0.5 * sum_i ( sum_j diff[j] * coef(i, j) )^2 ) )
 *
 * and stores it as a double at mem_80393[gtid * 8], where
 *   diff[j]    = mem_80384[j * N_68316 + gtid] - means[k * D_68320 + j],
 *   coef(i, j) = 0                      when i < j,
 *                exp(qs[k][i])          when i == j,
 *                an entry of icf_mem    otherwise (index computed below),
 * and i, j, d range over [0, D_68333).
 *
 * mem_80390 is used as per-thread scratch storage for diff[]; a __local
 * buffer is used to stage chunks of the qs row before they are summed.
 * All *_mem / mem_* arguments are byte buffers accessed as 8-byte doubles.
 */
__kernel void map_69238(int32_t N_68316, int32_t K_68318, int32_t D_68320,
int32_t D_68322, int32_t triD_68324, int32_t D_68333,
int32_t inner_ldim_80101, __global
unsigned char *alphas_mem_80367, __global
unsigned char *means_mem_80368, __global
unsigned char *qs_mem_80369, __global
unsigned char *icf_mem_80370, __global
unsigned char *mem_80384, __global
unsigned char *mem_80390, __global
unsigned char *mem_80393)
{
const int32_t group_sizze_69861 = gmm_objectivezigroup_sizze_69218;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
/* Local staging buffer: 8 bytes per element, group-size elements. */
ALIGNED_LOCAL_MEMORY(mem_80387_backing_0, 8 *
sext_i32_i64(gmm_objectivezigroup_sizze_69218));
int32_t global_tid_69238;
int32_t local_tid_69239;
int32_t group_sizze_82036;
int32_t wave_sizze_82035;
int32_t group_id_69240;
global_tid_69238 = get_global_id(0);
local_tid_69239 = get_local_id(0);
group_sizze_82036 = get_local_size(0);
wave_sizze_82035 = LOCKSTEP_WIDTH;
group_id_69240 = get_group_id(0);
int32_t gtid_69216;
int32_t inner_ltid_80100;
/* remainder + quotient * divisor: reassembles global_tid_69238. */
gtid_69216 = srem32(global_tid_69238, inner_ldim_80101) +
squot32(global_tid_69238, inner_ldim_80101) * inner_ldim_80101;
inner_ltid_80100 = srem32(global_tid_69238, inner_ldim_80101);
__local char *mem_80387;
mem_80387 = (__local char *) mem_80387_backing_0;
double res_69882;
double res_69986;
/*
 * NOTE(review): the barrier() calls below sit inside this per-thread bounds
 * check; whether every work-item of a group reaches them depends on the
 * launch configuration -- confirm against the host code.
 */
if (slt32(gtid_69216, N_68316)) {
/* x_69885 accumulates the sum of exponentials over k. */
double x_69885 = 0.0;
for (int32_t chunk_offset_69884 = 0; chunk_offset_69884 < K_68318;
chunk_offset_69884++) {
double alphas_elem_69898 = *(__global
double *) &alphas_mem_80367[chunk_offset_69884 *
8];
double res_69904;
/* x_69907 accumulates the sum of the first D_68333 entries of qs row k. */
double x_69907 = 0.0;
int32_t chunk_sizze_69905;
int32_t chunk_offset_69906 = 0;
/* Process the row in chunks of at most group_sizze_69861 elements. */
while (slt32(chunk_offset_69906, D_68333)) {
if (slt32(D_68333 - chunk_offset_69906, group_sizze_69861)) {
chunk_sizze_69905 = D_68333 - chunk_offset_69906;
} else {
chunk_sizze_69905 = group_sizze_69861;
}
double res_69909;
double sync_80107;
/* Stage the current chunk of qs into local memory. */
for (int32_t comb_iter_82038 = 0; comb_iter_82038 <
squot32(group_sizze_69861 + inner_ldim_80101 - 1,
inner_ldim_80101); comb_iter_82038++) {
int32_t cid_80099;
int32_t flat_comb_id_82039 = comb_iter_82038 *
inner_ldim_80101 + local_tid_69239;
cid_80099 = flat_comb_id_82039;
if (slt32(cid_80099, chunk_sizze_69905) && 1) {
/*
 * NOTE(review): the load is indexed with local_tid_69239 while the store
 * below is indexed with cid_80099; the two only coincide on the first
 * iteration of this loop -- confirm this is intended.
 */
double x_chunk_outer_elem_80098 = *(__global
double *) &qs_mem_80369[(chunk_offset_69884 *
D_68322 +
chunk_offset_69906 +
local_tid_69239) *
8];
*(__local double *) &mem_80387[cid_80099 * 8] =
x_chunk_outer_elem_80098;
}
}
/* Wait until the staged chunk is visible before reading it. */
barrier(CLK_LOCAL_MEM_FENCE);
double acc_69912 = x_69907;
int32_t groupstream_mapaccum_dummy_chunk_sizze_69910;
groupstream_mapaccum_dummy_chunk_sizze_69910 = 1;
/* Both branches add the staged values to the accumulator; they differ only
   in the loop bound (full group size vs. the shorter final chunk). */
if (chunk_sizze_69905 == group_sizze_69861) {
for (int32_t i_69911 = 0; i_69911 < group_sizze_69861;
i_69911++) {
double x_69914;
double res_69917;
x_69914 = *(__local double *) &mem_80387[i_69911 * 8];
res_69917 = acc_69912 + x_69914;
double acc_tmp_82040 = res_69917;
acc_69912 = acc_tmp_82040;
}
} else {
for (int32_t i_69911 = 0; i_69911 < chunk_sizze_69905;
i_69911++) {
double x_69914;
double res_69917;
x_69914 = *(__local double *) &mem_80387[i_69911 * 8];
res_69917 = acc_69912 + x_69914;
double acc_tmp_82041 = res_69917;
acc_69912 = acc_tmp_82041;
}
}
res_69909 = acc_69912;
sync_80107 = res_69909;
/* Second barrier: the buffer is overwritten by the next chunk. */
barrier(CLK_LOCAL_MEM_FENCE);
x_69907 = sync_80107;
chunk_offset_69906 += group_sizze_69861;
}
res_69904 = x_69907;
double x_69918;
double res_69928;
double y_69981;
double res_69982;
double res_69983;
double res_69985;
/* alphas[k] + sum of qs row k. */
x_69918 = alphas_elem_69898 + res_69904;
/* Write diff[i] = mem_80384[i * N + gtid] - means[k * D_68320 + i] into the
   scratch buffer slot owned by this (group, local thread). */
for (int32_t i_69923 = 0; i_69923 < D_68333; i_69923++) {
double x_elem_elem_69924;
double means_elem_elem_69925;
double res_69926;
x_elem_elem_69924 = *(__global double *) &mem_80384[(i_69923 *
N_68316 +
gtid_69216) *
8];
means_elem_elem_69925 = *(__global
double *) &means_mem_80368[(chunk_offset_69884 *
D_68320 +
i_69923) *
8];
res_69926 = x_elem_elem_69924 - means_elem_elem_69925;
*(__global double *) &mem_80390[(group_id_69240 *
(inner_ldim_80101 * D_68333) +
local_tid_69239 + i_69923 *
inner_ldim_80101) * 8] =
res_69926;
}
/* x_69931 accumulates the squared row products over i. */
double x_69931 = 0.0;
for (int32_t chunk_offset_69930 = 0; chunk_offset_69930 < D_68333;
chunk_offset_69930++) {
double qs_elem_elem_69941;
double res_69943;
double res_69978;
double res_69980;
qs_elem_elem_69941 = *(__global
double *) &qs_mem_80369[(chunk_offset_69884 *
D_68322 +
chunk_offset_69930) *
8];
/* x_69946 accumulates sum_j diff[j] * coef(i, j) for the current i. */
double x_69946 = 0.0;
for (int32_t chunk_offset_69945 = 0; chunk_offset_69945 <
D_68333; chunk_offset_69945++) {
double x_69956;
bool cond_69958;
double res_69959;
double res_69975;
double res_69977;
x_69956 = *(__global double *) &mem_80390[(group_id_69240 *
(inner_ldim_80101 *
D_68333) +
local_tid_69239 +
chunk_offset_69945 *
inner_ldim_80101) *
8];
/* i < j: coefficient is zero. */
cond_69958 = slt32(chunk_offset_69930, chunk_offset_69945);
if (cond_69958) {
res_69959 = 0.0;
} else {
bool cond_69960;
double res_69961;
/* i == j: coefficient is exp(qs[k][i]). */
cond_69960 = chunk_offset_69930 == chunk_offset_69945;
if (cond_69960) {
double res_69962;
res_69962 = futrts_exp64(qs_elem_elem_69941);
res_69961 = res_69962;
} else {
int32_t y_69963;
int32_t x_69964;
int32_t res_69965;
int32_t gmm_knossos_tri_arg_69966;
int32_t y_69967;
int32_t x_69968;
int32_t res_69969;
int32_t x_69970;
int32_t x_69971;
int32_t y_69972;
int32_t i_69973;
double res_69974;
/* i > j: coefficient is icf[k * triD_68324 + idx] with
   idx = D*(D-1)/2 - (D-j)*(D-j-1)/2 + (i - j - 1), D = D_68333. */
y_69963 = D_68333 - 1;
x_69964 = D_68333 * y_69963;
res_69965 = sdiv32(x_69964, 2);
gmm_knossos_tri_arg_69966 = D_68333 -
chunk_offset_69945;
y_69967 = gmm_knossos_tri_arg_69966 - 1;
x_69968 = gmm_knossos_tri_arg_69966 * y_69967;
res_69969 = sdiv32(x_69968, 2);
x_69970 = res_69965 - res_69969;
x_69971 = chunk_offset_69930 - chunk_offset_69945;
y_69972 = x_69971 - 1;
i_69973 = x_69970 + y_69972;
res_69974 = *(__global
double *) &icf_mem_80370[(chunk_offset_69884 *
triD_68324 +
i_69973) *
8];
res_69961 = res_69974;
}
res_69959 = res_69961;
}
res_69975 = x_69956 * res_69959;
res_69977 = x_69946 + res_69975;
double x_tmp_82044 = res_69977;
x_69946 = x_tmp_82044;
}
res_69943 = x_69946;
res_69978 = res_69943 * res_69943;
res_69980 = x_69931 + res_69978;
double x_tmp_82043 = res_69980;
x_69931 = x_tmp_82043;
}
res_69928 = x_69931;
/* exp(alphas[k] + sum qs - 0.5 * sum of squares), added to the k-sum. */
y_69981 = 0.5 * res_69928;
res_69982 = x_69918 - y_69981;
res_69983 = futrts_exp64(res_69982);
res_69985 = x_69885 + res_69983;
double x_tmp_82037 = res_69985;
x_69885 = x_tmp_82037;
}
res_69882 = x_69885;
res_69986 = futrts_log64(res_69882);
}
/* res_69986 is only assigned (and only written out) for in-range threads. */
if (slt32(gtid_69216, N_68316)) {
*(__global double *) &mem_80393[gtid_69216 * 8] = res_69986;
}
}
/*
 * Element-wise natural logarithm: for every global id g below N_68316, reads
 * the double at mem_80432[g * 8] and stores futrts_log64 of it at
 * mem_80435[g * 8].
 */
__kernel void map_69582(int32_t N_68316, __global unsigned char *mem_80432,
                        __global unsigned char *mem_80435)
{
    const int32_t gtid = get_global_id(0);

    if (slt32(gtid, N_68316)) {
        const double input = *(__global double *) &mem_80432[gtid * 8];
        const double output = futrts_log64(input);

        *(__global double *) &mem_80435[gtid * 8] = output;
    }
}
/*
 * For every index g below K_68318 computes
 *
 *   0.5 * x_68452 * ( sum_d exp(q[d][g])^2 + sum_t v[t][g]^2 )
 *     - res_68453 * sum_d q[d][g]
 *     - y_68475
 *
 * where q[d][g] is the double at mem_80465[(d * K_68321 + g) * 8] for d in
 * [0, D_68333) and v[t][g] is the double at mem_80461[(t * K_68323 + g) * 8]
 * for t in [0, triD_68324). The result is stored at mem_80468[g * 8].
 *
 * Each physical work-group walks over several virtual group ids, stepping by
 * num_groups_70963 until virt_groups_70970 is covered.
 */
__kernel void map_70494(int32_t K_68318, int32_t K_68321, int32_t K_68323,
                        int32_t triD_68324, int32_t D_68333, double x_68452,
                        double res_68453, double y_68475,
                        int32_t num_groups_70963, int32_t virt_groups_70970,
                        __global unsigned char *mem_80461, __global
                        unsigned char *mem_80465, __global
                        unsigned char *mem_80468)
{
    const int32_t group_size = gmm_objectivezigroup_sizze_70474;
    const int32_t ltid = get_local_id(0);
    const int32_t phys_group = get_group_id(0);

    /* Number of virtual groups handled by this physical group (rounded up). */
    const int32_t passes = squot32(virt_groups_70970 - phys_group +
                                   num_groups_70963 - 1, num_groups_70963);

    for (int32_t pass = 0; pass < passes; pass++) {
        const int32_t virt_group = phys_group + pass * num_groups_70963;
        const int32_t gtid = virt_group * group_size + ltid;

        if (!slt32(gtid, K_68318)) {
            continue;
        }

        /* Sum of exp(q)^2 over the D_68333 entries of column gtid. */
        double exp_sq_sum = 0.0;
        for (int32_t d = 0; d < D_68333; d++) {
            const double q = *(__global double *) &mem_80465[(d * K_68321 +
                                                              gtid) * 8];
            const double e = futrts_exp64(q);
            const double e_sq = e * e;

            exp_sq_sum = exp_sq_sum + e_sq;
        }

        /* Sum of squares over the triD_68324 entries of column gtid. */
        double sq_sum = 0.0;
        for (int32_t t = 0; t < triD_68324; t++) {
            const double v = *(__global double *) &mem_80461[(t * K_68323 +
                                                              gtid) * 8];
            const double v_sq = v * v;

            sq_sum = sq_sum + v_sq;
        }

        const double total = exp_sq_sum + sq_sum;
        const double scaled = x_68452 * total;
        const double halved = 0.5 * scaled;

        /* Plain sum of the same D_68333 entries read in the first loop. */
        double q_sum = 0.0;
        for (int32_t d = 0; d < D_68333; d++) {
            const double q = *(__global double *) &mem_80465[(d * K_68321 +
                                                              gtid) * 8];

            q_sum = q_sum + q;
        }

        const double weighted = res_68453 * q_sum;
        const double partial = halved - weighted;
        const double result = partial - y_68475;

        *(__global double *) &mem_80468[gtid * 8] = result;
    }
}
/*
 * For every global id g below K_68318 computes
 *   mem_80489[g] - res_68453 * mem_80492[g] - y_68475
 * on 8-byte doubles and stores the result at mem_80495[g * 8].
 */
__kernel void map_70670(int32_t K_68318, double res_68453, double y_68475,
                        __global unsigned char *mem_80489, __global
                        unsigned char *mem_80492, __global
                        unsigned char *mem_80495)
{
    const int32_t gtid = get_global_id(0);

    if (!slt32(gtid, K_68318)) {
        return;
    }

    const double base = *(__global double *) &mem_80489[gtid * 8];
    const double factor = *(__global double *) &mem_80492[gtid * 8];
    const double weighted = res_68453 * factor;
    const double partial = base - weighted;
    const double result = partial - y_68475;

    *(__global double *) &mem_80495[gtid * 8] = result;
}
/*
 * For every global id g below K_68318 computes
 *   0.5 * (x_68452 * (mem_80483[g] + mem_80486[g]))
 * on 8-byte doubles and stores the result at mem_80489[g * 8].
 */
__kernel void map_70713(int32_t K_68318, double x_68452, __global
                        unsigned char *mem_80483, __global
                        unsigned char *mem_80486, __global
                        unsigned char *mem_80489)
{
    const int32_t gtid = get_global_id(0);

    if (!slt32(gtid, K_68318)) {
        return;
    }

    const double first = *(__global double *) &mem_80483[gtid * 8];
    const double second = *(__global double *) &mem_80486[gtid * 8];
    const double total = first + second;
    const double scaled = x_68452 * total;
    const double result = 0.5 * scaled;

    *(__global double *) &mem_80489[gtid * 8] = result;
}
__kernel void map_71445(int32_t N_68508, int32_t K_68510, int32_t D_68512,
int32_t D_68514, int32_t triD_68516, double d_r_68524,
int32_t D_68526, int32_t num_groups_71747,
int32_t virt_groups_71754, __global
unsigned char *alphas_mem_80367, __global
unsigned char *means_mem_80368, __global
unsigned char *qs_mem_80369, __global
unsigned char *icf_mem_80370, __global
unsigned char *mem_80374, __global
unsigned char *mem_80377, __global
unsigned char *mem_80380, __global
unsigned char *mem_80384, __global
unsigned char *mem_80388, __global
unsigned char *mem_80392, __global
unsigned char *mem_80395, __global
unsigned char *mem_80398, __global
unsigned char *mem_80402, __global
unsigned char *mem_80411, __global
unsigned char *mem_80415, __global
unsigned char *mem_80419, __global
unsigned char *mem_80422, __global
unsigned char *mem_80426, __global
unsigned char *mem_80430, __global
unsigned char *mem_80433, __global
unsigned char *mem_80436, __global
unsigned char *mem_80439, __global
unsigned char *mem_80442, __global
unsigned char *mem_80445, __global
unsigned char *mem_80448, __global
unsigned char *mem_80480, __global
unsigned char *mem_80485, __global
unsigned char *mem_80490, __global
unsigned char *mem_80495, __global
unsigned char *mem_81655, __global
unsigned char *mem_81658, __global
unsigned char *mem_81665, __global
unsigned char *mem_81668)
{
const int32_t group_sizze_71737 = rev_gmm_objectivezigroup_sizze_71425;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
int32_t global_tid_71445;
int32_t local_tid_71446;
int32_t group_sizze_82366;
int32_t wave_sizze_82365;
int32_t group_id_71447;
global_tid_71445 = get_global_id(0);
local_tid_71446 = get_local_id(0);
group_sizze_82366 = get_local_size(0);
wave_sizze_82365 = LOCKSTEP_WIDTH;
group_id_71447 = get_group_id(0);
int32_t gtid_71423;
int32_t phys_group_id_82367;
phys_group_id_82367 = get_group_id(0);
for (int32_t i_82368 = 0; i_82368 < squot32(virt_groups_71754 -
phys_group_id_82367 +
num_groups_71747 - 1,
num_groups_71747); i_82368++) {
int32_t virt_group_id_82369 = phys_group_id_82367 + i_82368 *
num_groups_71747;
gtid_71423 = virt_group_id_82369 * group_sizze_71737 + local_tid_71446;
double res_71763;
double x_71832;
double res_71833;
if (slt32(gtid_71423, N_68508)) {
double redout_71765 = 0.0;
for (int32_t i_71767 = 0; i_71767 < K_68510; i_71767++) {
double alphas_elem_71768;
double res_71773;
double x_71780;
double res_71790;
double y_71825;
double res_71826;
double res_71827;
double res_71830;
alphas_elem_71768 = *(__global
double *) &alphas_mem_80367[i_71767 * 8];
double redout_71774 = 0.0;
for (int32_t i_71775 = 0; i_71775 < D_68526; i_71775++) {
double x_71776;
double res_71779;
x_71776 = *(__global double *) &qs_mem_80369[(i_71767 *
D_68514 +
i_71775) * 8];
res_71779 = redout_71774 + x_71776;
double redout_tmp_82372 = res_71779;
redout_71774 = redout_tmp_82372;
}
res_71773 = redout_71774;
x_71780 = alphas_elem_71768 + res_71773;
for (int32_t i_71785 = 0; i_71785 < D_68526; i_71785++) {
double x_elem_elem_71786;
double means_elem_elem_71787;
double res_71788;
x_elem_elem_71786 = *(__global
double *) &mem_80374[(i_71785 *
N_68508 +
gtid_71423) *
8];
means_elem_elem_71787 = *(__global
double *) &means_mem_80368[(i_71767 *
D_68512 +
i_71785) *
8];
res_71788 = x_elem_elem_71786 - means_elem_elem_71787;
*(__global double *) &mem_80380[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 + i_71785 *
group_sizze_71737) * 8] =
res_71788;
}
double redout_71791 = 0.0;
for (int32_t i_71792 = 0; i_71792 < D_68526; i_71792++) {
double qs_elem_elem_71794;
double res_71795;
double res_71821;
double res_71824;
qs_elem_elem_71794 = *(__global
double *) &qs_mem_80369[(i_71767 *
D_68514 +
i_71792) *
8];
double redout_71796 = 0.0;
for (int32_t i_71797 = 0; i_71797 < D_68526; i_71797++) {
double x_71799;
bool cond_71800;
double res_71801;
double res_71817;
double res_71820;
x_71799 = *(__global
double *) &mem_80380[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_71797 *
group_sizze_71737) *
8];
cond_71800 = slt32(i_71792, i_71797);
if (cond_71800) {
res_71801 = 0.0;
} else {
bool cond_71802;
double res_71803;
cond_71802 = i_71792 == i_71797;
if (cond_71802) {
double res_71804;
res_71804 = futrts_exp64(qs_elem_elem_71794);
res_71803 = res_71804;
} else {
int32_t y_71805;
int32_t x_71806;
int32_t res_71807;
int32_t gmm_knossos_tri_arg_71808;
int32_t y_71809;
int32_t x_71810;
int32_t res_71811;
int32_t x_71812;
int32_t x_71813;
int32_t y_71814;
int32_t i_71815;
double res_71816;
y_71805 = D_68526 - 1;
x_71806 = D_68526 * y_71805;
res_71807 = sdiv32(x_71806, 2);
gmm_knossos_tri_arg_71808 = D_68526 - i_71797;
y_71809 = gmm_knossos_tri_arg_71808 - 1;
x_71810 = gmm_knossos_tri_arg_71808 * y_71809;
res_71811 = sdiv32(x_71810, 2);
x_71812 = res_71807 - res_71811;
x_71813 = i_71792 - i_71797;
y_71814 = x_71813 - 1;
i_71815 = x_71812 + y_71814;
res_71816 = *(__global
double *) &icf_mem_80370[(i_71767 *
triD_68516 +
i_71815) *
8];
res_71803 = res_71816;
}
res_71801 = res_71803;
}
res_71817 = x_71799 * res_71801;
res_71820 = redout_71796 + res_71817;
double redout_tmp_82375 = res_71820;
redout_71796 = redout_tmp_82375;
}
res_71795 = redout_71796;
res_71821 = res_71795 * res_71795;
res_71824 = redout_71791 + res_71821;
double redout_tmp_82374 = res_71824;
redout_71791 = redout_tmp_82374;
}
res_71790 = redout_71791;
y_71825 = 0.5 * res_71790;
res_71826 = x_71780 - y_71825;
res_71827 = futrts_exp64(res_71826);
res_71830 = redout_71765 + res_71827;
*(__global double *) &mem_80377[(group_id_71447 *
(group_sizze_71737 * K_68510) +
local_tid_71446 + i_71767 *
group_sizze_71737) * 8] =
res_71826;
double redout_tmp_82370 = res_71830;
redout_71765 = redout_tmp_82370;
}
res_71763 = redout_71765;
x_71832 = 1.0 / res_71763;
res_71833 = d_r_68524 * x_71832;
for (int32_t i_71847 = 0; i_71847 < K_68510; i_71847++) {
double res_elem_71848;
double res_71852;
double res_71853;
double y_71893;
double rev_sqnorm_arg_71894;
res_elem_71848 = *(__global
double *) &mem_80377[(group_id_71447 *
(group_sizze_71737 *
K_68510) +
local_tid_71446 +
i_71847 *
group_sizze_71737) *
8];
res_71852 = futrts_exp64(res_elem_71848);
res_71853 = res_71833 * res_71852;
for (int32_t i_71862 = 0; i_71862 < D_68526; i_71862++) {
double qs_elem_elem_71864;
double x_elem_elem_71865;
double means_elem_elem_71866;
double res_71890;
qs_elem_elem_71864 = *(__global
double *) &qs_mem_80369[(i_71847 *
D_68514 +
i_71862) *
8];
x_elem_elem_71865 = *(__global
double *) &mem_80374[(i_71862 *
N_68508 +
gtid_71423) *
8];
means_elem_elem_71866 = *(__global
double *) &means_mem_80368[(i_71847 *
D_68512 +
i_71862) *
8];
for (int32_t i_71870 = 0; i_71870 < D_68526; i_71870++) {
bool cond_71872;
double res_71873;
cond_71872 = slt32(i_71862, i_71870);
if (cond_71872) {
res_71873 = 0.0;
} else {
bool cond_71874;
double res_71875;
cond_71874 = i_71862 == i_71870;
if (cond_71874) {
double res_71876;
res_71876 = futrts_exp64(qs_elem_elem_71864);
res_71875 = res_71876;
} else {
int32_t y_71877;
int32_t x_71878;
int32_t res_71879;
int32_t gmm_knossos_tri_arg_71880;
int32_t y_71881;
int32_t x_71882;
int32_t res_71883;
int32_t x_71884;
int32_t x_71885;
int32_t y_71886;
int32_t i_71887;
double res_71888;
y_71877 = D_68526 - 1;
x_71878 = D_68526 * y_71877;
res_71879 = sdiv32(x_71878, 2);
gmm_knossos_tri_arg_71880 = D_68526 - i_71870;
y_71881 = gmm_knossos_tri_arg_71880 - 1;
x_71882 = gmm_knossos_tri_arg_71880 * y_71881;
res_71883 = sdiv32(x_71882, 2);
x_71884 = res_71879 - res_71883;
x_71885 = i_71862 - i_71870;
y_71886 = x_71885 - 1;
i_71887 = x_71884 + y_71886;
res_71888 = *(__global
double *) &icf_mem_80370[(i_71847 *
triD_68516 +
i_71887) *
8];
res_71875 = res_71888;
}
res_71873 = res_71875;
}
*(__global double *) &mem_80402[(group_id_71447 *
(group_sizze_71737 *
D_68526 * D_68526) +
local_tid_71446 +
i_71862 *
(group_sizze_71737 *
D_68526) + i_71870 *
group_sizze_71737) *
8] = res_71873;
}
res_71890 = x_elem_elem_71865 - means_elem_elem_71866;
*(__global double *) &mem_80398[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 + i_71862 *
group_sizze_71737) * 8] =
res_71890;
}
y_71893 = 0.0 - res_71853;
rev_sqnorm_arg_71894 = 0.5 * y_71893;
for (int32_t i_71898 = 0; i_71898 < D_68526; i_71898++) {
double res_71900;
double res_71909;
double res_71910;
double redout_71901 = 0.0;
for (int32_t i_71902 = 0; i_71902 < D_68526; i_71902++) {
double x_71903;
double x_71904;
double res_71905;
double res_71908;
x_71903 = *(__global
double *) &mem_80398[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_71902 *
group_sizze_71737) *
8];
x_71904 = *(__global
double *) &mem_80402[(group_id_71447 *
(group_sizze_71737 *
D_68526 * D_68526) +
local_tid_71446 +
(i_71898 *
(group_sizze_71737 *
D_68526) + i_71902 *
group_sizze_71737)) *
8];
res_71905 = x_71903 * x_71904;
res_71908 = redout_71901 + res_71905;
double redout_tmp_82384 = res_71908;
redout_71901 = redout_tmp_82384;
}
res_71900 = redout_71901;
res_71909 = rev_sqnorm_arg_71894 * res_71900;
res_71910 = res_71909 + res_71909;
*(__global double *) &mem_80411[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 + i_71898 *
group_sizze_71737) * 8] =
res_71910;
}
for (int32_t i_71922 = 0; i_71922 < D_68526; i_71922++) {
double x_71923;
double qs_elem_elem_71926;
double res_71927;
double res_71936;
x_71923 = *(__global double *) &mem_80411[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_71922 *
group_sizze_71737) *
8];
qs_elem_elem_71926 = *(__global
double *) &qs_mem_80369[(i_71847 *
D_68514 +
i_71922) *
8];
double redout_71928 = 0.0;
for (int32_t i_71929 = 0; i_71929 < D_68526; i_71929++) {
double x_71930;
double x_71931;
double res_71932;
double res_71935;
x_71930 = *(__global
double *) &mem_80402[(group_id_71447 *
(group_sizze_71737 *
D_68526 * D_68526) +
local_tid_71446 +
(i_71929 *
(group_sizze_71737 *
D_68526) + i_71922 *
group_sizze_71737)) *
8];
x_71931 = *(__global
double *) &mem_80411[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_71929 *
group_sizze_71737) *
8];
res_71932 = x_71930 * x_71931;
res_71935 = redout_71928 + res_71932;
double redout_tmp_82388 = res_71935;
redout_71928 = redout_tmp_82388;
}
res_71927 = redout_71928;
res_71936 = 0.0 - res_71927;
for (int32_t i_71943 = 0; i_71943 < D_68526; i_71943++) {
double x_71944;
double res_71946;
bool cond_71947;
bool cond_71948;
x_71944 = *(__global
double *) &mem_80398[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_71943 *
group_sizze_71737) *
8];
res_71946 = x_71923 * x_71944;
cond_71947 = slt32(i_71922, i_71943);
cond_71948 = i_71922 == i_71943;
if (cond_71947) {
for (int32_t i_82391 = 0; i_82391 < D_68526;
i_82391++) {
*(__global
double *) &mem_80433[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82391 *
group_sizze_71737) *
8] = 0.0;
}
for (int32_t i_82392 = 0; i_82392 < triD_68516;
i_82392++) {
*(__global
double *) &mem_80436[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82392 *
group_sizze_71737) *
8] = 0.0;
}
for (int32_t i_82393 = 0; i_82393 < D_68526;
i_82393++) {
*(__global
double *) &mem_81668[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82393 *
group_sizze_71737) *
8] = *(__global
double *) &mem_80433[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82393 *
group_sizze_71737) *
8];
}
for (int32_t i_82394 = 0; i_82394 < triD_68516;
i_82394++) {
*(__global
double *) &mem_81665[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82394 *
group_sizze_71737) *
8] = *(__global
double *) &mem_80436[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82394 *
group_sizze_71737) *
8];
}
} else {
if (cond_71948) {
double res_71955;
double deltaVec_arg_71956;
res_71955 = futrts_exp64(qs_elem_elem_71926);
deltaVec_arg_71956 = res_71946 * res_71955;
for (int32_t i_71961 = 0; i_71961 < D_68526;
i_71961++) {
bool cond_71963;
double res_71964;
cond_71963 = i_71961 == i_71922;
if (cond_71963) {
res_71964 = deltaVec_arg_71956;
} else {
res_71964 = 0.0;
}
*(__global
double *) &mem_80439[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_71961 *
group_sizze_71737) *
8] = res_71964;
}
for (int32_t i_82396 = 0; i_82396 < triD_68516;
i_82396++) {
*(__global
double *) &mem_80442[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82396 *
group_sizze_71737) *
8] = 0.0;
}
for (int32_t i_82397 = 0; i_82397 < D_68526;
i_82397++) {
*(__global
double *) &mem_81658[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82397 *
group_sizze_71737) *
8] = *(__global
double *) &mem_80439[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82397 *
group_sizze_71737) *
8];
}
for (int32_t i_82398 = 0; i_82398 < triD_68516;
i_82398++) {
*(__global
double *) &mem_81655[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82398 *
group_sizze_71737) *
8] = *(__global
double *) &mem_80442[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82398 *
group_sizze_71737) *
8];
}
} else {
int32_t y_71967;
int32_t x_71968;
int32_t res_71969;
int32_t deltaVec_arg_71970;
y_71967 = i_71922 - 1;
x_71968 = i_71922 * y_71967;
res_71969 = sdiv32(x_71968, 2);
deltaVec_arg_71970 = i_71943 + res_71969;
for (int32_t i_71975 = 0; i_71975 < triD_68516;
i_71975++) {
bool cond_71977;
double res_71978;
cond_71977 = i_71975 == deltaVec_arg_71970;
if (cond_71977) {
res_71978 = res_71946;
} else {
res_71978 = 0.0;
}
*(__global
double *) &mem_80445[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_71975 *
group_sizze_71737) *
8] = res_71978;
}
for (int32_t i_82400 = 0; i_82400 < D_68526;
i_82400++) {
*(__global
double *) &mem_80448[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82400 *
group_sizze_71737) *
8] = 0.0;
}
for (int32_t i_82401 = 0; i_82401 < D_68526;
i_82401++) {
*(__global
double *) &mem_81658[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82401 *
group_sizze_71737) *
8] = *(__global
double *) &mem_80448[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82401 *
group_sizze_71737) *
8];
}
for (int32_t i_82402 = 0; i_82402 < triD_68516;
i_82402++) {
*(__global
double *) &mem_81655[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82402 *
group_sizze_71737) *
8] = *(__global
double *) &mem_80445[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82402 *
group_sizze_71737) *
8];
}
}
for (int32_t i_82403 = 0; i_82403 < D_68526;
i_82403++) {
*(__global
double *) &mem_81668[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82403 *
group_sizze_71737) *
8] = *(__global
double *) &mem_81658[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82403 *
group_sizze_71737) *
8];
}
for (int32_t i_82404 = 0; i_82404 < triD_68516;
i_82404++) {
*(__global
double *) &mem_81665[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82404 *
group_sizze_71737) *
8] = *(__global
double *) &mem_81655[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82404 *
group_sizze_71737) *
8];
}
}
for (int32_t i_82405 = 0; i_82405 < triD_68516;
i_82405++) {
*(__global double *) &mem_80430[(group_id_71447 *
(group_sizze_71737 *
triD_68516 *
D_68526) +
local_tid_71446 +
i_71943 *
(group_sizze_71737 *
triD_68516) +
i_82405 *
group_sizze_71737) *
8] = *(__global
double *) &mem_81665[(group_id_71447 *
(group_sizze_71737 *
triD_68516) +
local_tid_71446 +
i_82405 *
group_sizze_71737) *
8];
}
for (int32_t i_82406 = 0; i_82406 < D_68526;
i_82406++) {
*(__global double *) &mem_80426[(group_id_71447 *
(group_sizze_71737 *
D_68526 *
D_68526) +
local_tid_71446 +
i_71943 *
(group_sizze_71737 *
D_68526) +
i_82406 *
group_sizze_71737) *
8] = *(__global
double *) &mem_81668[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82406 *
group_sizze_71737) *
8];
}
}
for (int32_t i_71987 = 0; i_71987 < D_68526; i_71987++) {
double res_71989;
double redout_71990 = 0.0;
for (int32_t i_71991 = 0; i_71991 < D_68526;
i_71991++) {
double x_71992;
double res_71995;
x_71992 = *(__global
double *) &mem_80426[(group_id_71447 *
(group_sizze_71737 *
D_68526 *
D_68526) +
local_tid_71446 +
(i_71991 *
(group_sizze_71737 *
D_68526) +
i_71987 *
group_sizze_71737)) *
8];
res_71995 = redout_71990 + x_71992;
double redout_tmp_82408 = res_71995;
redout_71990 = redout_tmp_82408;
}
res_71989 = redout_71990;
*(__global double *) &mem_80415[(group_id_71447 *
(group_sizze_71737 *
D_68526 * D_68526) +
local_tid_71446 +
i_71922 *
(group_sizze_71737 *
D_68526) + i_71987 *
group_sizze_71737) *
8] = res_71989;
}
for (int32_t i_72001 = 0; i_72001 < triD_68516; i_72001++) {
double res_72003;
double redout_72004 = 0.0;
for (int32_t i_72005 = 0; i_72005 < D_68526;
i_72005++) {
double x_72006;
double res_72009;
x_72006 = *(__global
double *) &mem_80430[(group_id_71447 *
(group_sizze_71737 *
triD_68516 *
D_68526) +
local_tid_71446 +
(i_72005 *
(group_sizze_71737 *
triD_68516) +
i_72001 *
group_sizze_71737)) *
8];
res_72009 = redout_72004 + x_72006;
double redout_tmp_82410 = res_72009;
redout_72004 = redout_tmp_82410;
}
res_72003 = redout_72004;
*(__global double *) &mem_80419[(group_id_71447 *
(group_sizze_71737 *
triD_68516 *
D_68526) +
local_tid_71446 +
i_71922 *
(group_sizze_71737 *
triD_68516) +
i_72001 *
group_sizze_71737) *
8] = res_72003;
}
*(__global double *) &mem_80422[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 + i_71922 *
group_sizze_71737) * 8] =
res_71936;
}
for (int32_t i_82411 = 0; i_82411 < D_68526; i_82411++) {
*(__global double *) &mem_80384[(group_id_71447 *
(group_sizze_71737 *
D_68526 * K_68510) +
local_tid_71446 + i_71847 *
(group_sizze_71737 *
D_68526) + i_82411 *
group_sizze_71737) * 8] =
*(__global double *) &mem_80422[(group_id_71447 *
(group_sizze_71737 *
D_68526) +
local_tid_71446 +
i_82411 *
group_sizze_71737) *
8];
}
for (int32_t i_72018 = 0; i_72018 < triD_68516; i_72018++) {
double res_72020;
double redout_72021 = 0.0;
for (int32_t i_72022 = 0; i_72022 < D_68526; i_72022++) {
double x_72023;
double res_72026;
x_72023 = *(__global
double *) &mem_80419[(group_id_71447 *
(group_sizze_71737 *
triD_68516 *
D_68526) +
local_tid_71446 +
(i_72022 *
(group_sizze_71737 *
triD_68516) +
i_72018 *
group_sizze_71737)) *
8];
res_72026 = redout_72021 + x_72023;
double redout_tmp_82413 = res_72026;
redout_72021 = redout_tmp_82413;
}
res_72020 = redout_72021;
*(__global double *) &mem_80392[(group_id_71447 *
(group_sizze_71737 *
triD_68516 * K_68510) +
local_tid_71446 + i_71847 *
(group_sizze_71737 *
triD_68516) + i_72018 *
group_sizze_71737) * 8] =
res_72020;
}
for (int32_t i_72032 = 0; i_72032 < D_68526; i_72032++) {
double res_72034;
double res_72041;
double redout_72035 = 0.0;
for (int32_t i_72036 = 0; i_72036 < D_68526; i_72036++) {
double x_72037;
double res_72040;
x_72037 = *(__global
double *) &mem_80415[(group_id_71447 *
(group_sizze_71737 *
D_68526 * D_68526) +
local_tid_71446 +
(i_72036 *
(group_sizze_71737 *
D_68526) + i_72032 *
group_sizze_71737)) *
8];
res_72040 = redout_72035 + x_72037;
double redout_tmp_82415 = res_72040;
redout_72035 = redout_tmp_82415;
}
res_72034 = redout_72035;
res_72041 = res_71853 + res_72034;
*(__global double *) &mem_80388[(group_id_71447 *
(group_sizze_71737 *
D_68526 * K_68510) +
local_tid_71446 + i_71847 *
(group_sizze_71737 *
D_68526) + i_72032 *
group_sizze_71737) * 8] =
res_72041;
}
*(__global double *) &mem_80395[(group_id_71447 *
(group_sizze_71737 * K_68510) +
local_tid_71446 + i_71847 *
group_sizze_71737) * 8] =
res_71853;
}
}
if (slt32(gtid_71423, N_68508)) {
for (int32_t i_82416 = 0; i_82416 < K_68510; i_82416++) {
*(__global double *) &mem_80480[(gtid_71423 + i_82416 *
N_68508) * 8] = *(__global
double *) &mem_80395[(group_id_71447 *
(group_sizze_71737 *
K_68510) +
local_tid_71446 +
i_82416 *
group_sizze_71737) *
8];
}
}
if (slt32(gtid_71423, N_68508)) {
for (int32_t i_82417 = 0; i_82417 < K_68510; i_82417++) {
for (int32_t i_82418 = 0; i_82418 < D_68526; i_82418++) {
*(__global double *) &mem_80485[(N_68508 * D_68526 * 0 +
N_68508 * 0 + gtid_71423 +
(i_82417 * (N_68508 *
D_68526) +
i_82418 * N_68508)) * 8] =
*(__global double *) &mem_80384[(group_id_71447 *
(group_sizze_71737 *
D_68526 * K_68510) +
local_tid_71446 +
(i_82417 *
(group_sizze_71737 *
D_68526) + i_82418 *
group_sizze_71737)) *
8];
}
}
}
if (slt32(gtid_71423, N_68508)) {
for (int32_t i_82419 = 0; i_82419 < K_68510; i_82419++) {
for (int32_t i_82420 = 0; i_82420 < D_68526; i_82420++) {
*(__global double *) &mem_80490[(N_68508 * D_68526 * 0 +
N_68508 * 0 + gtid_71423 +
(i_82419 * (N_68508 *
D_68526) +
i_82420 * N_68508)) * 8] =
*(__global double *) &mem_80388[(group_id_71447 *
(group_sizze_71737 *
D_68526 * K_68510) +
local_tid_71446 +
(i_82419 *
(group_sizze_71737 *
D_68526) + i_82420 *
group_sizze_71737)) *
8];
}
}
}
if (slt32(gtid_71423, N_68508)) {
for (int32_t i_82421 = 0; i_82421 < K_68510; i_82421++) {
for (int32_t i_82422 = 0; i_82422 < triD_68516; i_82422++) {
*(__global double *) &mem_80495[(N_68508 * triD_68516 * 0 +
N_68508 * 0 + gtid_71423 +
(i_82421 * (N_68508 *
triD_68516) +
i_82422 * N_68508)) * 8] =
*(__global double *) &mem_80392[(group_id_71447 *
(group_sizze_71737 *
triD_68516 *
K_68510) +
local_tid_71446 +
(i_82421 *
(group_sizze_71737 *
triD_68516) +
i_82422 *
group_sizze_71737)) *
8];
}
}
}
}
}
__kernel void map_72450(int32_t N_68508, int32_t D_68509, int32_t K_68510,
int32_t K_68511, int32_t K_68513, int32_t K_68515,
int32_t triD_68516, int32_t D_68526,
int32_t num_groups_76462, int32_t virt_groups_76469,
__global unsigned char *x_mem_80366, __global
unsigned char *mem_80631, __global
unsigned char *mem_80635, __global
unsigned char *mem_80639, __global
unsigned char *mem_80649, __global
unsigned char *mem_80652, __global
unsigned char *mem_80655, __global
unsigned char *mem_80659, __global
unsigned char *mem_80668, __global
unsigned char *mem_80672, __global
unsigned char *mem_80676, __global
unsigned char *mem_80679, __global
unsigned char *mem_80683, __global
unsigned char *mem_80687, __global
unsigned char *mem_80690, __global
unsigned char *mem_80693, __global
unsigned char *mem_80696, __global
unsigned char *mem_80699, __global
unsigned char *mem_80702, __global
unsigned char *mem_80705, __global
unsigned char *mem_80724, __global
unsigned char *mem_80727, __global
unsigned char *mem_80731, __global
unsigned char *mem_80736, __global
unsigned char *mem_80741, __global
unsigned char *mem_80746, __global
unsigned char *mem_81705, __global
unsigned char *mem_81708, __global
unsigned char *mem_81715, __global
unsigned char *mem_81718)
{
const int32_t group_sizze_76452 = rev_gmm_objectivezigroup_sizze_72430;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
int32_t global_tid_72450;
int32_t local_tid_72451;
int32_t group_sizze_82532;
int32_t wave_sizze_82531;
int32_t group_id_72452;
global_tid_72450 = get_global_id(0);
local_tid_72451 = get_local_id(0);
group_sizze_82532 = get_local_size(0);
wave_sizze_82531 = LOCKSTEP_WIDTH;
group_id_72452 = get_group_id(0);
int32_t gtid_72426;
int32_t gtid_72427;
int32_t phys_group_id_82533;
phys_group_id_82533 = get_group_id(0);
for (int32_t i_82534 = 0; i_82534 < squot32(virt_groups_76469 -
phys_group_id_82533 +
num_groups_76462 - 1,
num_groups_76462); i_82534++) {
int32_t virt_group_id_82535 = phys_group_id_82533 + i_82534 *
num_groups_76462;
gtid_72426 = squot32(virt_group_id_82535 * group_sizze_76452 +
local_tid_72451, K_68510);
gtid_72427 = virt_group_id_82535 * group_sizze_76452 + local_tid_72451 -
squot32(virt_group_id_82535 * group_sizze_76452 + local_tid_72451,
K_68510) * K_68510;
double res_76495;
double res_elem_76497;
double res_76501;
double res_76502;
double y_76542;
double rev_sqnorm_arg_76543;
if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {
res_76495 = *(__global double *) &mem_80652[gtid_72426 * 8];
res_elem_76497 = *(__global double *) &mem_80649[(gtid_72426 *
K_68510 +
gtid_72427) * 8];
res_76501 = futrts_exp64(res_elem_76497);
res_76502 = res_76495 * res_76501;
for (int32_t i_76511 = 0; i_76511 < D_68526; i_76511++) {
double qs_elem_elem_76513;
double x_elem_elem_76514;
double means_elem_elem_76515;
double res_76539;
qs_elem_elem_76513 = *(__global double *) &mem_80631[(i_76511 *
K_68513 +
gtid_72427) *
8];
x_elem_elem_76514 = *(__global
double *) &x_mem_80366[(gtid_72426 *
D_68509 +
i_76511) * 8];
means_elem_elem_76515 = *(__global
double *) &mem_80635[(i_76511 *
K_68511 +
gtid_72427) *
8];
for (int32_t i_76519 = 0; i_76519 < D_68526; i_76519++) {
bool cond_76521;
double res_76522;
cond_76521 = slt32(i_76511, i_76519);
if (cond_76521) {
res_76522 = 0.0;
} else {
bool cond_76523;
double res_76524;
cond_76523 = i_76511 == i_76519;
if (cond_76523) {
double res_76525;
res_76525 = futrts_exp64(qs_elem_elem_76513);
res_76524 = res_76525;
} else {
int32_t y_76526;
int32_t x_76527;
int32_t res_76528;
int32_t gmm_knossos_tri_arg_76529;
int32_t y_76530;
int32_t x_76531;
int32_t res_76532;
int32_t x_76533;
int32_t x_76534;
int32_t y_76535;
int32_t i_76536;
double res_76537;
y_76526 = D_68526 - 1;
x_76527 = D_68526 * y_76526;
res_76528 = sdiv32(x_76527, 2);
gmm_knossos_tri_arg_76529 = D_68526 - i_76519;
y_76530 = gmm_knossos_tri_arg_76529 - 1;
x_76531 = gmm_knossos_tri_arg_76529 * y_76530;
res_76532 = sdiv32(x_76531, 2);
x_76533 = res_76528 - res_76532;
x_76534 = i_76511 - i_76519;
y_76535 = x_76534 - 1;
i_76536 = x_76533 + y_76535;
res_76537 = *(__global
double *) &mem_80639[(i_76536 *
K_68515 +
gtid_72427) *
8];
res_76524 = res_76537;
}
res_76522 = res_76524;
}
*(__global double *) &mem_80659[(group_id_72452 *
(group_sizze_76452 *
D_68526 * D_68526) +
local_tid_72451 + i_76511 *
(group_sizze_76452 *
D_68526) + i_76519 *
group_sizze_76452) * 8] =
res_76522;
}
res_76539 = x_elem_elem_76514 - means_elem_elem_76515;
*(__global double *) &mem_80655[(group_id_72452 *
(group_sizze_76452 * D_68526) +
local_tid_72451 + i_76511 *
group_sizze_76452) * 8] =
res_76539;
}
y_76542 = 0.0 - res_76502;
rev_sqnorm_arg_76543 = 0.5 * y_76542;
for (int32_t i_76547 = 0; i_76547 < D_68526; i_76547++) {
double res_76549;
double res_76558;
double res_76559;
double redout_76550 = 0.0;
for (int32_t i_76551 = 0; i_76551 < D_68526; i_76551++) {
double x_76552;
double x_76553;
double res_76554;
double res_76557;
x_76552 = *(__global double *) &mem_80655[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_76551 *
group_sizze_76452) *
8];
x_76553 = *(__global double *) &mem_80659[(group_id_72452 *
(group_sizze_76452 *
D_68526 *
D_68526) +
local_tid_72451 +
(i_76547 *
(group_sizze_76452 *
D_68526) +
i_76551 *
group_sizze_76452)) *
8];
res_76554 = x_76552 * x_76553;
res_76557 = redout_76550 + res_76554;
double redout_tmp_82540 = res_76557;
redout_76550 = redout_tmp_82540;
}
res_76549 = redout_76550;
res_76558 = rev_sqnorm_arg_76543 * res_76549;
res_76559 = res_76558 + res_76558;
*(__global double *) &mem_80668[(group_id_72452 *
(group_sizze_76452 * D_68526) +
local_tid_72451 + i_76547 *
group_sizze_76452) * 8] =
res_76559;
}
for (int32_t i_76571 = 0; i_76571 < D_68526; i_76571++) {
double x_76572;
double qs_elem_elem_76575;
double res_76576;
double res_76585;
x_76572 = *(__global double *) &mem_80668[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_76571 *
group_sizze_76452) *
8];
qs_elem_elem_76575 = *(__global double *) &mem_80631[(i_76571 *
K_68513 +
gtid_72427) *
8];
double redout_76577 = 0.0;
for (int32_t i_76578 = 0; i_76578 < D_68526; i_76578++) {
double x_76579;
double x_76580;
double res_76581;
double res_76584;
x_76579 = *(__global double *) &mem_80659[(group_id_72452 *
(group_sizze_76452 *
D_68526 *
D_68526) +
local_tid_72451 +
(i_76578 *
(group_sizze_76452 *
D_68526) +
i_76571 *
group_sizze_76452)) *
8];
x_76580 = *(__global double *) &mem_80668[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_76578 *
group_sizze_76452) *
8];
res_76581 = x_76579 * x_76580;
res_76584 = redout_76577 + res_76581;
double redout_tmp_82544 = res_76584;
redout_76577 = redout_tmp_82544;
}
res_76576 = redout_76577;
res_76585 = 0.0 - res_76576;
for (int32_t i_76592 = 0; i_76592 < D_68526; i_76592++) {
double x_76593;
double res_76595;
bool cond_76596;
bool cond_76597;
x_76593 = *(__global double *) &mem_80655[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_76592 *
group_sizze_76452) *
8];
res_76595 = x_76572 * x_76593;
cond_76596 = slt32(i_76571, i_76592);
cond_76597 = i_76571 == i_76592;
if (cond_76596) {
for (int32_t i_82547 = 0; i_82547 < D_68526;
i_82547++) {
*(__global double *) &mem_80690[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82547 *
group_sizze_76452) *
8] = 0.0;
}
for (int32_t i_82548 = 0; i_82548 < triD_68516;
i_82548++) {
*(__global double *) &mem_80693[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82548 *
group_sizze_76452) *
8] = 0.0;
}
for (int32_t i_82549 = 0; i_82549 < D_68526;
i_82549++) {
*(__global double *) &mem_81718[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82549 *
group_sizze_76452) *
8] = *(__global
double *) &mem_80690[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82549 *
group_sizze_76452) *
8];
}
for (int32_t i_82550 = 0; i_82550 < triD_68516;
i_82550++) {
*(__global double *) &mem_81715[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82550 *
group_sizze_76452) *
8] = *(__global
double *) &mem_80693[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82550 *
group_sizze_76452) *
8];
}
} else {
if (cond_76597) {
double res_76604;
double deltaVec_arg_76605;
res_76604 = futrts_exp64(qs_elem_elem_76575);
deltaVec_arg_76605 = res_76595 * res_76604;
for (int32_t i_76610 = 0; i_76610 < D_68526;
i_76610++) {
bool cond_76612;
double res_76613;
cond_76612 = i_76610 == i_76571;
if (cond_76612) {
res_76613 = deltaVec_arg_76605;
} else {
res_76613 = 0.0;
}
*(__global
double *) &mem_80696[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_76610 *
group_sizze_76452) *
8] = res_76613;
}
for (int32_t i_82552 = 0; i_82552 < triD_68516;
i_82552++) {
*(__global
double *) &mem_80699[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82552 *
group_sizze_76452) *
8] = 0.0;
}
for (int32_t i_82553 = 0; i_82553 < D_68526;
i_82553++) {
*(__global
double *) &mem_81708[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82553 *
group_sizze_76452) *
8] = *(__global
double *) &mem_80696[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82553 *
group_sizze_76452) *
8];
}
for (int32_t i_82554 = 0; i_82554 < triD_68516;
i_82554++) {
*(__global
double *) &mem_81705[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82554 *
group_sizze_76452) *
8] = *(__global
double *) &mem_80699[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82554 *
group_sizze_76452) *
8];
}
} else {
int32_t y_76616;
int32_t x_76617;
int32_t res_76618;
int32_t deltaVec_arg_76619;
y_76616 = i_76571 - 1;
x_76617 = i_76571 * y_76616;
res_76618 = sdiv32(x_76617, 2);
deltaVec_arg_76619 = i_76592 + res_76618;
for (int32_t i_76624 = 0; i_76624 < triD_68516;
i_76624++) {
bool cond_76626;
double res_76627;
cond_76626 = i_76624 == deltaVec_arg_76619;
if (cond_76626) {
res_76627 = res_76595;
} else {
res_76627 = 0.0;
}
*(__global
double *) &mem_80702[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_76624 *
group_sizze_76452) *
8] = res_76627;
}
for (int32_t i_82556 = 0; i_82556 < D_68526;
i_82556++) {
*(__global
double *) &mem_80705[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82556 *
group_sizze_76452) *
8] = 0.0;
}
for (int32_t i_82557 = 0; i_82557 < D_68526;
i_82557++) {
*(__global
double *) &mem_81708[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82557 *
group_sizze_76452) *
8] = *(__global
double *) &mem_80705[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82557 *
group_sizze_76452) *
8];
}
for (int32_t i_82558 = 0; i_82558 < triD_68516;
i_82558++) {
*(__global
double *) &mem_81705[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82558 *
group_sizze_76452) *
8] = *(__global
double *) &mem_80702[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82558 *
group_sizze_76452) *
8];
}
}
for (int32_t i_82559 = 0; i_82559 < D_68526;
i_82559++) {
*(__global double *) &mem_81718[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82559 *
group_sizze_76452) *
8] = *(__global
double *) &mem_81708[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82559 *
group_sizze_76452) *
8];
}
for (int32_t i_82560 = 0; i_82560 < triD_68516;
i_82560++) {
*(__global double *) &mem_81715[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82560 *
group_sizze_76452) *
8] = *(__global
double *) &mem_81705[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82560 *
group_sizze_76452) *
8];
}
}
for (int32_t i_82561 = 0; i_82561 < triD_68516; i_82561++) {
*(__global double *) &mem_80687[(group_id_72452 *
(group_sizze_76452 *
triD_68516 *
D_68526) +
local_tid_72451 +
i_76592 *
(group_sizze_76452 *
triD_68516) +
i_82561 *
group_sizze_76452) *
8] = *(__global
double *) &mem_81715[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 +
i_82561 *
group_sizze_76452) *
8];
}
for (int32_t i_82562 = 0; i_82562 < D_68526; i_82562++) {
*(__global double *) &mem_80683[(group_id_72452 *
(group_sizze_76452 *
D_68526 * D_68526) +
local_tid_72451 +
i_76592 *
(group_sizze_76452 *
D_68526) + i_82562 *
group_sizze_76452) *
8] = *(__global
double *) &mem_81718[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 +
i_82562 *
group_sizze_76452) *
8];
}
}
for (int32_t i_76636 = 0; i_76636 < D_68526; i_76636++) {
double res_76638;
double redout_76639 = 0.0;
for (int32_t i_76640 = 0; i_76640 < D_68526; i_76640++) {
double x_76641;
double res_76644;
x_76641 = *(__global
double *) &mem_80683[(group_id_72452 *
(group_sizze_76452 *
D_68526 * D_68526) +
local_tid_72451 +
(i_76640 *
(group_sizze_76452 *
D_68526) + i_76636 *
group_sizze_76452)) *
8];
res_76644 = redout_76639 + x_76641;
double redout_tmp_82564 = res_76644;
redout_76639 = redout_tmp_82564;
}
res_76638 = redout_76639;
*(__global double *) &mem_80672[(group_id_72452 *
(group_sizze_76452 *
D_68526 * D_68526) +
local_tid_72451 + i_76571 *
(group_sizze_76452 *
D_68526) + i_76636 *
group_sizze_76452) * 8] =
res_76638;
}
for (int32_t i_76650 = 0; i_76650 < triD_68516; i_76650++) {
double res_76652;
double redout_76653 = 0.0;
for (int32_t i_76654 = 0; i_76654 < D_68526; i_76654++) {
double x_76655;
double res_76658;
x_76655 = *(__global
double *) &mem_80687[(group_id_72452 *
(group_sizze_76452 *
triD_68516 *
D_68526) +
local_tid_72451 +
(i_76654 *
(group_sizze_76452 *
triD_68516) +
i_76650 *
group_sizze_76452)) *
8];
res_76658 = redout_76653 + x_76655;
double redout_tmp_82566 = res_76658;
redout_76653 = redout_tmp_82566;
}
res_76652 = redout_76653;
*(__global double *) &mem_80676[(group_id_72452 *
(group_sizze_76452 *
triD_68516 * D_68526) +
local_tid_72451 + i_76571 *
(group_sizze_76452 *
triD_68516) + i_76650 *
group_sizze_76452) * 8] =
res_76652;
}
*(__global double *) &mem_80679[(group_id_72452 *
(group_sizze_76452 * D_68526) +
local_tid_72451 + i_76571 *
group_sizze_76452) * 8] =
res_76585;
}
for (int32_t i_76667 = 0; i_76667 < triD_68516; i_76667++) {
double res_76669;
double redout_76670 = 0.0;
for (int32_t i_76671 = 0; i_76671 < D_68526; i_76671++) {
double x_76672;
double res_76675;
x_76672 = *(__global double *) &mem_80676[(group_id_72452 *
(group_sizze_76452 *
triD_68516 *
D_68526) +
local_tid_72451 +
(i_76671 *
(group_sizze_76452 *
triD_68516) +
i_76667 *
group_sizze_76452)) *
8];
res_76675 = redout_76670 + x_76672;
double redout_tmp_82568 = res_76675;
redout_76670 = redout_tmp_82568;
}
res_76669 = redout_76670;
*(__global double *) &mem_80724[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 + i_76667 *
group_sizze_76452) * 8] =
res_76669;
}
for (int32_t i_76681 = 0; i_76681 < D_68526; i_76681++) {
double res_76683;
double res_76690;
double redout_76684 = 0.0;
for (int32_t i_76685 = 0; i_76685 < D_68526; i_76685++) {
double x_76686;
double res_76689;
x_76686 = *(__global double *) &mem_80672[(group_id_72452 *
(group_sizze_76452 *
D_68526 *
D_68526) +
local_tid_72451 +
(i_76685 *
(group_sizze_76452 *
D_68526) +
i_76681 *
group_sizze_76452)) *
8];
res_76689 = redout_76684 + x_76686;
double redout_tmp_82570 = res_76689;
redout_76684 = redout_tmp_82570;
}
res_76683 = redout_76684;
res_76690 = res_76502 + res_76683;
*(__global double *) &mem_80727[(group_id_72452 *
(group_sizze_76452 * D_68526) +
local_tid_72451 + i_76681 *
group_sizze_76452) * 8] =
res_76690;
}
}
if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {
*(__global double *) &mem_80731[(gtid_72426 * K_68510 +
gtid_72427) * 8] = res_76502;
}
if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {
for (int32_t i_82571 = 0; i_82571 < D_68526; i_82571++) {
*(__global double *) &mem_80736[(K_68510 * N_68508 * 0 +
gtid_72426 * K_68510 +
gtid_72427 + i_82571 *
(K_68510 * N_68508)) * 8] =
*(__global double *) &mem_80679[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 + i_82571 *
group_sizze_76452) * 8];
}
}
if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {
for (int32_t i_82572 = 0; i_82572 < D_68526; i_82572++) {
*(__global double *) &mem_80741[(K_68510 * N_68508 * 0 +
gtid_72426 * K_68510 +
gtid_72427 + i_82572 *
(K_68510 * N_68508)) * 8] =
*(__global double *) &mem_80727[(group_id_72452 *
(group_sizze_76452 *
D_68526) +
local_tid_72451 + i_82572 *
group_sizze_76452) * 8];
}
}
if (slt32(gtid_72426, N_68508) && slt32(gtid_72427, K_68510)) {
for (int32_t i_82573 = 0; i_82573 < triD_68516; i_82573++) {
*(__global double *) &mem_80746[(K_68510 * N_68508 * 0 +
gtid_72426 * K_68510 +
gtid_72427 + i_82573 *
(K_68510 * N_68508)) * 8] =
*(__global double *) &mem_80724[(group_id_72452 *
(group_sizze_76452 *
triD_68516) +
local_tid_72451 + i_82573 *
group_sizze_76452) * 8];
}
}
}
}
/*
 * map_72987 -- one work-item per element of a flattened N x K x D index
 * space (the last index varies fastest).
 *
 * For the element (n, k, d) selected by the global id, the kernel adds
 *   - the double stored in mem_81347 at element offset k * N + n, and
 *   - the sum, for c = 0 .. D-1, of the doubles stored in mem_81359 at
 *     element offset c * (D*K*N) + n * (D*K) + k * D + d,
 * and writes the result to mem_81364 at element offset
 * n * (D*K) + k * D + d.  Work-items whose decomposed indices fall outside
 * N, K or D return without touching memory.
 *
 * Every buffer is passed as raw bytes; each access multiplies the element
 * offset by 8 and reinterprets the address as a __global double.
 */
__kernel void map_72987(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *mem_81347,
                        __global unsigned char *mem_81359,
                        __global unsigned char *mem_81364)
{
    const int32_t flat_id = get_global_id(0);

    /* Peel the flat id apart: quotient by K*D, then quotient by D, then rest. */
    const int32_t kd_extent = K_68510 * D_68526;
    const int32_t n_idx = squot32(flat_id, kd_extent);
    const int32_t within_n = flat_id - n_idx * kd_extent;
    const int32_t k_idx = squot32(within_n, D_68526);
    const int32_t d_idx = within_n - k_idx * D_68526;

    if (!((slt32(n_idx, N_68508) && slt32(k_idx, K_68510)) &&
          slt32(d_idx, D_68526))) {
        return;
    }

    /* Element offset of (n, k, d) inside one N x K x D plane. */
    const int32_t cell = n_idx * kd_extent + k_idx * D_68526 + d_idx;
    /* Distance, in elements, between consecutive planes of mem_81359. */
    const int32_t plane = kd_extent * N_68508;

    const double addend =
        *(__global double *) &mem_81347[(k_idx * N_68508 + n_idx) * 8];

    /* Sequential left-to-right accumulation over the leading dimension. */
    double total = 0.0;
    for (int32_t c = 0; c < D_68526; c++) {
        const double term =
            *(__global double *) &mem_81359[(c * plane + cell) * 8];
        total = total + term;
    }

    *(__global double *) &mem_81364[cell * 8] = addend + total;
}
/*
 * map_73051 -- one work-item per element of a flattened N x K x D index
 * space (the last index varies fastest).
 *
 * For the element (n, k, d) selected by the global id, reads the double in
 * mem_81391 at element offset k * N + n and the double in mem_81387 at
 * element offset n * (D*K) + k * D + d, and stores their sum in mem_81396
 * at that same n * (D*K) + k * D + d offset.  Work-items whose indices fall
 * outside N, K or D return without touching memory.
 *
 * Buffers are raw bytes; element offsets are scaled by 8 and the address is
 * reinterpreted as a __global double.
 */
__kernel void map_73051(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *mem_81387,
                        __global unsigned char *mem_81391,
                        __global unsigned char *mem_81396)
{
    const int32_t flat_id = get_global_id(0);

    /* Peel the flat id apart: quotient by K*D, then quotient by D, then rest. */
    const int32_t kd_extent = K_68510 * D_68526;
    const int32_t n_idx = squot32(flat_id, kd_extent);
    const int32_t within_n = flat_id - n_idx * kd_extent;
    const int32_t k_idx = squot32(within_n, D_68526);
    const int32_t d_idx = within_n - k_idx * D_68526;

    if (!((slt32(n_idx, N_68508) && slt32(k_idx, K_68510)) &&
          slt32(d_idx, D_68526))) {
        return;
    }

    /* Shared element offset for the 3-D input and the output. */
    const int32_t cell = n_idx * kd_extent + k_idx * D_68526 + d_idx;

    const double lhs =
        *(__global double *) &mem_81391[(k_idx * N_68508 + n_idx) * 8];
    const double rhs = *(__global double *) &mem_81387[cell * 8];

    *(__global double *) &mem_81396[cell * 8] = lhs + rhs;
}
/*
 * map_73155 -- copies mem_81337 into mem_81343 while exchanging the roles of
 * the two leading D-sized indices.
 *
 * Each physical work-group walks over the virtual group ids
 *   get_group_id(0) + r * num_groups_77829,   r = 0, 1, ...
 * for as many rounds as the squot32 expression below yields.  In each round
 * a work-item derives an (n, k) pair from
 *   virtual_group * group_size + local_id
 * and, when n < N and k < K, performs for every (a, b) in D x D:
 *   dst[n*K + k + a*(K*N*D) + b*(K*N)] = src[n*K + k + b*(K*N*D) + a*(K*N)]
 * (offsets counted in doubles; the byte address is the offset times 8).
 *
 * The group size comes from the build-time symbol
 * rev_gmm_objectivezigroup_sizze_73135.
 */
__kernel void map_73155(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        int32_t num_groups_77829, int32_t virt_groups_77836,
                        __global unsigned char *mem_81337,
                        __global unsigned char *mem_81343)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_73135;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);

    /* Number of virtual groups handled by this physical group. */
    const int32_t rounds = squot32(virt_groups_77836 - phys_group +
                                   num_groups_77829 - 1, num_groups_77829);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_77829;
        const int32_t flat_id = virt_group * wg_size + lane;

        const int32_t n_idx = squot32(flat_id, K_68510);
        const int32_t k_idx = flat_id - n_idx * K_68510;

        if (!(slt32(n_idx, N_68508) && slt32(k_idx, K_68510))) {
            continue;
        }

        const int32_t cell = n_idx * K_68510 + k_idx;
        const int32_t inner_stride = K_68510 * N_68508;
        const int32_t outer_stride = inner_stride * D_68526;

        for (int32_t a = 0; a < D_68526; a++) {
            for (int32_t b = 0; b < D_68526; b++) {
                const int32_t dst_elem =
                    cell + (a * outer_stride + b * inner_stride);
                const int32_t src_elem =
                    cell + (b * outer_stride + a * inner_stride);

                *(__global double *) &mem_81343[dst_elem * 8] =
                    *(__global double *) &mem_81337[src_elem * 8];
            }
        }
    }
}
/*
 * map_73182 -- one work-item per element of a flattened N x K x triD index
 * space (the last index varies fastest).
 *
 * For the element (n, k, t) selected by the global id, sums, for
 * c = 0 .. D-1, the doubles stored in mem_81301 at element offset
 *   c * (triD*K*N) + n * (triD*K) + k * triD + t
 * and writes the total to mem_81306 at element offset
 *   n * (triD*K) + k * triD + t.
 * Work-items whose indices fall outside N, K or triD return without
 * touching memory.
 *
 * Buffers are raw bytes; element offsets are scaled by 8 and the address is
 * reinterpreted as a __global double.
 */
__kernel void map_73182(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        int32_t D_68526, __global unsigned char *mem_81301,
                        __global unsigned char *mem_81306)
{
    const int32_t flat_id = get_global_id(0);

    /* Peel the flat id apart: quotient by K*triD, then by triD, then rest. */
    const int32_t kt_extent = K_68510 * triD_68516;
    const int32_t n_idx = squot32(flat_id, kt_extent);
    const int32_t within_n = flat_id - n_idx * kt_extent;
    const int32_t k_idx = squot32(within_n, triD_68516);
    const int32_t t_idx = within_n - k_idx * triD_68516;

    if (!((slt32(n_idx, N_68508) && slt32(k_idx, K_68510)) &&
          slt32(t_idx, triD_68516))) {
        return;
    }

    /* Element offset of (n, k, t) inside one N x K x triD plane. */
    const int32_t cell = n_idx * kt_extent + k_idx * triD_68516 + t_idx;
    /* Distance, in elements, between consecutive planes of mem_81301. */
    const int32_t plane = kt_extent * N_68508;

    /* Sequential left-to-right accumulation over the leading dimension. */
    double total = 0.0;
    for (int32_t c = 0; c < D_68526; c++) {
        const double term =
            *(__global double *) &mem_81301[(c * plane + cell) * 8];
        total = total + term;
    }

    *(__global double *) &mem_81306[cell * 8] = total;
}
/*
 * map_73315 -- copies mem_81283 into mem_81289 while exchanging the roles of
 * the leading D-sized and triD-sized indices.
 *
 * Each physical work-group walks over the virtual group ids
 *   get_group_id(0) + r * num_groups_77735,   r = 0, 1, ...
 * for as many rounds as the squot32 expression below yields.  In each round
 * a work-item derives an (n, k) pair from
 *   virtual_group * group_size + local_id
 * and, when n < N and k < K, performs for every a in [0, triD) and
 * b in [0, D):
 *   dst[n*K + k + a*(K*N*D)    + b*(K*N)] =
 *   src[n*K + k + b*(K*N*triD) + a*(K*N)]
 * (offsets counted in doubles; the byte address is the offset times 8).
 *
 * The group size comes from the build-time symbol
 * rev_gmm_objectivezigroup_sizze_73295.
 */
__kernel void map_73315(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        int32_t D_68526, int32_t num_groups_77735,
                        int32_t virt_groups_77742,
                        __global unsigned char *mem_81283,
                        __global unsigned char *mem_81289)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_73295;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);

    /* Number of virtual groups handled by this physical group. */
    const int32_t rounds = squot32(virt_groups_77742 - phys_group +
                                   num_groups_77735 - 1, num_groups_77735);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_77735;
        const int32_t flat_id = virt_group * wg_size + lane;

        const int32_t n_idx = squot32(flat_id, K_68510);
        const int32_t k_idx = flat_id - n_idx * K_68510;

        if (!(slt32(n_idx, N_68508) && slt32(k_idx, K_68510))) {
            continue;
        }

        const int32_t cell = n_idx * K_68510 + k_idx;
        const int32_t inner_stride = K_68510 * N_68508;
        const int32_t dst_outer_stride = inner_stride * D_68526;
        const int32_t src_outer_stride = inner_stride * triD_68516;

        for (int32_t a = 0; a < triD_68516; a++) {
            for (int32_t b = 0; b < D_68526; b++) {
                const int32_t dst_elem =
                    cell + (a * dst_outer_stride + b * inner_stride);
                const int32_t src_elem =
                    cell + (b * src_outer_stride + a * inner_stride);

                *(__global double *) &mem_81289[dst_elem * 8] =
                    *(__global double *) &mem_81283[src_elem * 8];
            }
        }
    }
}
/*
 * map_73392 -- per-(n, k, d) kernel over a flattened N x K x D index space,
 * executed with virtualised work-groups.
 *
 * Inputs read per element (offsets in doubles, byte address = offset * 8):
 *   res_r_r_mem_80930[n, k, d], mem_80868[n, k, d], mem_80889[n, k, i] and
 *   mem_80954[c, n, k, d] for i, c in [0, D).
 * Final outputs written per element:
 *   mem_81007[n, k, d]       = 0.0 - sum_c mem_80954[c, n, k, d] *
 *                                          res_r_r_mem_80930[n, k, c]
 *   mem_81002[j, n, k, d]    for j in [0, D)     (copied from mem_80987)
 *   mem_80996[j, n, k, d]    for j in [0, triD)  (copied from mem_80990)
 *
 * All remaining mem_* arguments are used as scratch space.  Each work-item
 * addresses its own slots in them with the pattern
 *   group_id * (group_size * len) + local_id + j * group_size
 * where group_id is the physical get_group_id(0), not the virtual group id.
 *
 * NOTE(review): squot32, slt32, sdiv32 and futrts_exp64 are defined outside
 * this block; the comments below assume they are signed quotient, signed
 * less-than, signed division and exp() on doubles respectively -- confirm
 * against their definitions.
 */
__kernel void map_73392(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
int32_t D_68526, int32_t num_groups_77161,
int32_t virt_groups_77168, __global
unsigned char *mem_80868, __global
unsigned char *mem_80889, __global
unsigned char *res_r_r_mem_80930, __global
unsigned char *mem_80954, __global
unsigned char *mem_80958, __global
unsigned char *mem_80962, __global
unsigned char *mem_80965, __global
unsigned char *mem_80968, __global
unsigned char *mem_80971, __global
unsigned char *mem_80974, __global
unsigned char *mem_80977, __global
unsigned char *mem_80980, __global
unsigned char *mem_80987, __global
unsigned char *mem_80990, __global
unsigned char *mem_80996, __global
unsigned char *mem_81002, __global
unsigned char *mem_81007, __global
unsigned char *mem_81749, __global
unsigned char *mem_81752, __global
unsigned char *mem_81759, __global
unsigned char *mem_81762)
{
/* Group size used for all index arithmetic; supplied by a symbol defined
   outside this block. */
const int32_t group_sizze_77151 = rev_gmm_objectivezigroup_sizze_73372;
/* block_dim0..2, group_sizze_82694 and wave_sizze_82693 are set here but not
   referenced again inside this kernel. */
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
int32_t global_tid_73392;
int32_t local_tid_73393;
int32_t group_sizze_82694;
int32_t wave_sizze_82693;
int32_t group_id_73394;
global_tid_73392 = get_global_id(0);
local_tid_73393 = get_local_id(0);
group_sizze_82694 = get_local_size(0);
wave_sizze_82693 = LOCKSTEP_WIDTH;
group_id_73394 = get_group_id(0);
int32_t gtid_73366;
int32_t gtid_73367;
int32_t gtid_73368;
int32_t phys_group_id_82695;
phys_group_id_82695 = get_group_id(0);
/* Virtual-group loop: this physical group handles virtual group ids
   phys_group_id + i * num_groups for each i below the squot32 bound. */
for (int32_t i_82696 = 0; i_82696 < squot32(virt_groups_77168 -
phys_group_id_82695 +
num_groups_77161 - 1,
num_groups_77161); i_82696++) {
int32_t virt_group_id_82697 = phys_group_id_82695 + i_82696 *
num_groups_77161;
/* Decompose the flat id (virt_group_id * group_size + local_id) into
   gtid_73366 (quotient by K*D), gtid_73367 (quotient of the rest by D)
   and gtid_73368 (what remains). */
gtid_73366 = squot32(virt_group_id_82697 * group_sizze_77151 +
local_tid_73393, K_68510 * D_68526);
gtid_73367 = squot32(virt_group_id_82697 * group_sizze_77151 +
local_tid_73393 - squot32(virt_group_id_82697 *
group_sizze_77151 +
local_tid_73393,
K_68510 * D_68526) *
(K_68510 * D_68526), D_68526);
gtid_73368 = virt_group_id_82697 * group_sizze_77151 + local_tid_73393 -
squot32(virt_group_id_82697 * group_sizze_77151 + local_tid_73393,
K_68510 * D_68526) * (K_68510 * D_68526) -
squot32(virt_group_id_82697 * group_sizze_77151 + local_tid_73393 -
squot32(virt_group_id_82697 * group_sizze_77151 +
local_tid_73393, K_68510 * D_68526) * (K_68510 *
D_68526),
D_68526) * D_68526;
double x_77195;
double qs_elem_elem_77198;
double res_77199;
double res_77217;
/* Compute phase: only work-items whose indices lie inside N x K x D. */
if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&
slt32(gtid_73368, D_68526)) {
x_77195 = *(__global double *) &res_r_r_mem_80930[(gtid_73366 *
(D_68526 *
K_68510) +
gtid_73367 *
D_68526 +
gtid_73368) * 8];
qs_elem_elem_77198 = *(__global double *) &mem_80868[(gtid_73366 *
(D_68526 *
K_68510) +
gtid_73367 *
D_68526 +
gtid_73368) *
8];
/* Dot product over chunk_offset of mem_80954[chunk_offset, n, k, d] with
   res_r_r_mem_80930[n, k, chunk_offset], accumulated left to right. */
double x_77202 = 0.0;
for (int32_t chunk_offset_77201 = 0; chunk_offset_77201 < D_68526;
chunk_offset_77201++) {
double x_77211;
double x_77212;
double res_77214;
double res_77216;
x_77211 = *(__global double *) &mem_80954[(chunk_offset_77201 *
(D_68526 * K_68510 *
N_68508) +
gtid_73366 *
(D_68526 * K_68510) +
gtid_73367 *
D_68526 +
gtid_73368) * 8];
x_77212 = *(__global double *) &res_r_r_mem_80930[(gtid_73366 *
(D_68526 *
K_68510) +
gtid_73367 *
D_68526 +
chunk_offset_77201) *
8];
res_77214 = x_77211 * x_77212;
res_77216 = x_77202 + res_77214;
double x_tmp_82698 = res_77216;
x_77202 = x_tmp_82698;
}
res_77199 = x_77202;
/* Negated dot product; stored to mem_81007 at the end of the round. */
res_77217 = 0.0 - res_77199;
/* For each i_77224 build one D-length row (ends up in mem_80958) and one
   triD-length row (ends up in mem_80962).  Three cases, depending on how
   gtid_73368 compares with i_77224. */
for (int32_t i_77224 = 0; i_77224 < D_68526; i_77224++) {
double x_77225;
double res_77227;
bool cond_77228;
bool cond_77229;
x_77225 = *(__global double *) &mem_80889[(gtid_73366 *
(D_68526 * K_68510) +
gtid_73367 *
D_68526 + i_77224) *
8];
res_77227 = x_77195 * x_77225;
cond_77228 = slt32(gtid_73368, i_77224);
cond_77229 = gtid_73368 == i_77224;
if (cond_77228) {
/* Case gtid_73368 < i_77224: both rows are all zeros.  They are written
   to mem_80965 / mem_80968 and then copied to mem_81762 / mem_81759. */
for (int32_t i_82701 = 0; i_82701 < D_68526; i_82701++) {
*(__global double *) &mem_80965[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82701 *
group_sizze_77151) *
8] = 0.0;
}
for (int32_t i_82702 = 0; i_82702 < triD_68516; i_82702++) {
*(__global double *) &mem_80968[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82702 *
group_sizze_77151) *
8] = 0.0;
}
for (int32_t i_82703 = 0; i_82703 < D_68526; i_82703++) {
*(__global double *) &mem_81762[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82703 *
group_sizze_77151) *
8] = *(__global
double *) &mem_80965[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82703 *
group_sizze_77151) *
8];
}
for (int32_t i_82704 = 0; i_82704 < triD_68516; i_82704++) {
*(__global double *) &mem_81759[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82704 *
group_sizze_77151) *
8] = *(__global
double *) &mem_80968[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82704 *
group_sizze_77151) *
8];
}
} else {
if (cond_77229) {
/* Case gtid_73368 == i_77224: the D-length row (mem_80971) holds
   res_77227 * futrts_exp64(qs_elem_elem_77198) at position gtid_73368 and
   zero elsewhere; the triD-length row (mem_80974) is all zeros.  Both are
   then copied to mem_81752 / mem_81749. */
double res_77236;
double deltaVec_arg_77237;
res_77236 = futrts_exp64(qs_elem_elem_77198);
deltaVec_arg_77237 = res_77227 * res_77236;
for (int32_t i_77242 = 0; i_77242 < D_68526;
i_77242++) {
bool cond_77244;
double res_77245;
cond_77244 = i_77242 == gtid_73368;
if (cond_77244) {
res_77245 = deltaVec_arg_77237;
} else {
res_77245 = 0.0;
}
*(__global double *) &mem_80971[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_77242 *
group_sizze_77151) *
8] = res_77245;
}
for (int32_t i_82706 = 0; i_82706 < triD_68516;
i_82706++) {
*(__global double *) &mem_80974[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82706 *
group_sizze_77151) *
8] = 0.0;
}
for (int32_t i_82707 = 0; i_82707 < D_68526;
i_82707++) {
*(__global double *) &mem_81752[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82707 *
group_sizze_77151) *
8] = *(__global
double *) &mem_80971[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82707 *
group_sizze_77151) *
8];
}
for (int32_t i_82708 = 0; i_82708 < triD_68516;
i_82708++) {
*(__global double *) &mem_81749[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82708 *
group_sizze_77151) *
8] = *(__global
double *) &mem_80974[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82708 *
group_sizze_77151) *
8];
}
} else {
/* Remaining case (neither less-than nor equal): the triD-length row
   (mem_80977) holds res_77227 at position
   i_77224 + sdiv32(gtid_73368 * (gtid_73368 - 1), 2) and zero elsewhere;
   the D-length row (mem_80980) is all zeros.  Both are then copied to
   mem_81752 / mem_81749. */
int32_t y_77248;
int32_t x_77249;
int32_t res_77250;
int32_t deltaVec_arg_77251;
y_77248 = gtid_73368 - 1;
x_77249 = gtid_73368 * y_77248;
res_77250 = sdiv32(x_77249, 2);
deltaVec_arg_77251 = i_77224 + res_77250;
for (int32_t i_77256 = 0; i_77256 < triD_68516;
i_77256++) {
bool cond_77258;
double res_77259;
cond_77258 = i_77256 == deltaVec_arg_77251;
if (cond_77258) {
res_77259 = res_77227;
} else {
res_77259 = 0.0;
}
*(__global double *) &mem_80977[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_77256 *
group_sizze_77151) *
8] = res_77259;
}
for (int32_t i_82710 = 0; i_82710 < D_68526;
i_82710++) {
*(__global double *) &mem_80980[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82710 *
group_sizze_77151) *
8] = 0.0;
}
for (int32_t i_82711 = 0; i_82711 < D_68526;
i_82711++) {
*(__global double *) &mem_81752[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82711 *
group_sizze_77151) *
8] = *(__global
double *) &mem_80980[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82711 *
group_sizze_77151) *
8];
}
for (int32_t i_82712 = 0; i_82712 < triD_68516;
i_82712++) {
*(__global double *) &mem_81749[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82712 *
group_sizze_77151) *
8] = *(__global
double *) &mem_80977[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82712 *
group_sizze_77151) *
8];
}
}
/* Common tail of the two non-"less-than" cases: forward mem_81752 to
   mem_81762 and mem_81749 to mem_81759, so all three cases leave their
   rows in the same pair of buffers. */
for (int32_t i_82713 = 0; i_82713 < D_68526; i_82713++) {
*(__global double *) &mem_81762[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82713 *
group_sizze_77151) *
8] = *(__global
double *) &mem_81752[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82713 *
group_sizze_77151) *
8];
}
for (int32_t i_82714 = 0; i_82714 < triD_68516; i_82714++) {
*(__global double *) &mem_81759[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82714 *
group_sizze_77151) *
8] = *(__global
double *) &mem_81749[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82714 *
group_sizze_77151) *
8];
}
}
/* Store the triD-length row for this i_77224 into mem_80962 and the
   D-length row into mem_80958 (row index i_77224). */
for (int32_t i_82715 = 0; i_82715 < triD_68516; i_82715++) {
*(__global double *) &mem_80962[(group_id_73394 *
(group_sizze_77151 *
triD_68516 * D_68526) +
local_tid_73393 + i_77224 *
(group_sizze_77151 *
triD_68516) + i_82715 *
group_sizze_77151) * 8] =
*(__global double *) &mem_81759[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82715 *
group_sizze_77151) *
8];
}
for (int32_t i_82716 = 0; i_82716 < D_68526; i_82716++) {
*(__global double *) &mem_80958[(group_id_73394 *
(group_sizze_77151 *
D_68526 * D_68526) +
local_tid_73393 + i_77224 *
(group_sizze_77151 *
D_68526) + i_82716 *
group_sizze_77151) * 8] =
*(__global double *) &mem_81762[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82716 *
group_sizze_77151) *
8];
}
}
/* Column sums of the D x D rows stored in mem_80958: for each column
   i_77268, add up the entries of all D rows and store in mem_80987. */
for (int32_t i_77268 = 0; i_77268 < D_68526; i_77268++) {
double res_77270;
double redout_77271 = 0.0;
for (int32_t i_77272 = 0; i_77272 < D_68526; i_77272++) {
double x_77273;
double res_77276;
x_77273 = *(__global double *) &mem_80958[(group_id_73394 *
(group_sizze_77151 *
D_68526 *
D_68526) +
local_tid_73393 +
(i_77272 *
(group_sizze_77151 *
D_68526) +
i_77268 *
group_sizze_77151)) *
8];
res_77276 = redout_77271 + x_77273;
double redout_tmp_82718 = res_77276;
redout_77271 = redout_tmp_82718;
}
res_77270 = redout_77271;
*(__global double *) &mem_80987[(group_id_73394 *
(group_sizze_77151 * D_68526) +
local_tid_73393 + i_77268 *
group_sizze_77151) * 8] =
res_77270;
}
/* Column sums of the D x triD rows stored in mem_80962: for each column
   i_77282, add up the entries of all D rows and store in mem_80990. */
for (int32_t i_77282 = 0; i_77282 < triD_68516; i_77282++) {
double res_77284;
double redout_77285 = 0.0;
for (int32_t i_77286 = 0; i_77286 < D_68526; i_77286++) {
double x_77287;
double res_77290;
x_77287 = *(__global double *) &mem_80962[(group_id_73394 *
(group_sizze_77151 *
triD_68516 *
D_68526) +
local_tid_73393 +
(i_77286 *
(group_sizze_77151 *
triD_68516) +
i_77282 *
group_sizze_77151)) *
8];
res_77290 = redout_77285 + x_77287;
double redout_tmp_82720 = res_77290;
redout_77285 = redout_tmp_82720;
}
res_77284 = redout_77285;
*(__global double *) &mem_80990[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 + i_77282 *
group_sizze_77151) * 8] =
res_77284;
}
}
/* Write-back phase 1: copy the triD column sums from the per-work-item
   scratch mem_80990 to mem_80996[i, n, k, d]. */
if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&
slt32(gtid_73368, D_68526)) {
for (int32_t i_82721 = 0; i_82721 < triD_68516; i_82721++) {
*(__global double *) &mem_80996[(D_68526 * K_68510 * N_68508 *
0 + gtid_73366 * (D_68526 *
K_68510) +
gtid_73367 * D_68526 +
gtid_73368 + i_82721 *
(D_68526 * K_68510 *
N_68508)) * 8] = *(__global
double *) &mem_80990[(group_id_73394 *
(group_sizze_77151 *
triD_68516) +
local_tid_73393 +
i_82721 *
group_sizze_77151) *
8];
}
}
/* Write-back phase 2: copy the D column sums from the per-work-item
   scratch mem_80987 to mem_81002[i, n, k, d]. */
if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&
slt32(gtid_73368, D_68526)) {
for (int32_t i_82722 = 0; i_82722 < D_68526; i_82722++) {
*(__global double *) &mem_81002[(D_68526 * K_68510 * N_68508 *
0 + gtid_73366 * (D_68526 *
K_68510) +
gtid_73367 * D_68526 +
gtid_73368 + i_82722 *
(D_68526 * K_68510 *
N_68508)) * 8] = *(__global
double *) &mem_80987[(group_id_73394 *
(group_sizze_77151 *
D_68526) +
local_tid_73393 +
i_82722 *
group_sizze_77151) *
8];
}
}
/* Write-back phase 3: store the negated dot product at [n, k, d]. */
if ((slt32(gtid_73366, N_68508) && slt32(gtid_73367, K_68510)) &&
slt32(gtid_73368, D_68526)) {
*(__global double *) &mem_81007[(gtid_73366 * (D_68526 * K_68510) +
gtid_73367 * D_68526 +
gtid_73368) * 8] = res_77217;
}
}
}
/*
 * map_73675 -- one work-item per element of a flattened N x K x D x triD
 * index space (the last index varies fastest).
 *
 * For the element (n, k, d, t) selected by the global id, sums, for
 * c = 0 .. D-1, the doubles stored in mem_81237 at element offset
 *   c * (triD*D*K*N) + n * (triD*D*K) + k * (triD*D) + d * triD + t
 * and writes the total to mem_81243 at element offset
 *   n * (triD*D*K) + k * (triD*D) + d * triD + t.
 * Work-items whose indices fall outside N, K, D or triD return without
 * touching memory.
 *
 * Buffers are raw bytes; element offsets are scaled by 8 and the address is
 * reinterpreted as a __global double.
 */
__kernel void map_73675(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        int32_t D_68526, __global unsigned char *mem_81237,
                        __global unsigned char *mem_81243)
{
    const int32_t flat_id = get_global_id(0);

    /* Successive quotient/remainder steps, outermost index first. */
    const int32_t kdt_extent = K_68510 * D_68526 * triD_68516;
    const int32_t dt_extent = D_68526 * triD_68516;

    const int32_t n_idx = squot32(flat_id, kdt_extent);
    const int32_t within_n = flat_id - n_idx * kdt_extent;
    const int32_t k_idx = squot32(within_n, dt_extent);
    const int32_t within_k = within_n - k_idx * dt_extent;
    const int32_t d_idx = squot32(within_k, triD_68516);
    const int32_t t_idx = within_k - d_idx * triD_68516;

    if (!(((slt32(n_idx, N_68508) && slt32(k_idx, K_68510)) &&
           slt32(d_idx, D_68526)) && slt32(t_idx, triD_68516))) {
        return;
    }

    /* Element offset of (n, k, d, t) inside one N x K x D x triD block. */
    const int32_t cell = n_idx * kdt_extent + k_idx * dt_extent +
                         d_idx * triD_68516 + t_idx;
    /* Distance, in elements, between consecutive blocks of mem_81237. */
    const int32_t block = kdt_extent * N_68508;

    /* Sequential left-to-right accumulation over the leading dimension. */
    double total = 0.0;
    for (int32_t c = 0; c < D_68526; c++) {
        const double term =
            *(__global double *) &mem_81237[(c * block + cell) * 8];
        total = total + term;
    }

    *(__global double *) &mem_81243[cell * 8] = total;
}
/*
 * map_73818 -- copies mem_81216 into mem_81223 while exchanging the roles of
 * the leading D-sized and triD-sized indices.
 *
 * Each physical work-group walks over the virtual group ids
 *   get_group_id(0) + r * num_groups_77634,   r = 0, 1, ...
 * for as many rounds as the squot32 expression below yields.  In each round
 * a work-item derives an (n, k, d) triple from
 *   virtual_group * group_size + local_id
 * and, when n < N, k < K and d < D, performs for every a in [0, triD) and
 * b in [0, D), with cell = n*(D*K) + k*D + d:
 *   dst[cell + a*(D*K*N*D)    + b*(D*K*N)] =
 *   src[cell + b*(D*K*N*triD) + a*(D*K*N)]
 * (offsets counted in doubles; the byte address is the offset times 8).
 *
 * The group size comes from the build-time symbol
 * rev_gmm_objectivezigroup_sizze_73798.
 */
__kernel void map_73818(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        int32_t D_68526, int32_t num_groups_77634,
                        int32_t virt_groups_77641,
                        __global unsigned char *mem_81216,
                        __global unsigned char *mem_81223)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_73798;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);

    /* Number of virtual groups handled by this physical group. */
    const int32_t rounds = squot32(virt_groups_77641 - phys_group +
                                   num_groups_77634 - 1, num_groups_77634);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_77634;
        const int32_t flat_id = virt_group * wg_size + lane;

        /* Quotient by K*D, then quotient of the rest by D, then the rest. */
        const int32_t kd_extent = K_68510 * D_68526;
        const int32_t n_idx = squot32(flat_id, kd_extent);
        const int32_t within_n = flat_id - n_idx * kd_extent;
        const int32_t k_idx = squot32(within_n, D_68526);
        const int32_t d_idx = within_n - k_idx * D_68526;

        if (!((slt32(n_idx, N_68508) && slt32(k_idx, K_68510)) &&
              slt32(d_idx, D_68526))) {
            continue;
        }

        const int32_t cell = n_idx * (D_68526 * K_68510) +
                             k_idx * D_68526 + d_idx;
        const int32_t inner_stride = D_68526 * K_68510 * N_68508;
        const int32_t dst_outer_stride = inner_stride * D_68526;
        const int32_t src_outer_stride = inner_stride * triD_68516;

        for (int32_t a = 0; a < triD_68516; a++) {
            for (int32_t b = 0; b < D_68526; b++) {
                const int32_t dst_elem =
                    cell + (a * dst_outer_stride + b * inner_stride);
                const int32_t src_elem =
                    cell + (b * src_outer_stride + a * inner_stride);

                *(__global double *) &mem_81223[dst_elem * 8] =
                    *(__global double *) &mem_81216[src_elem * 8];
            }
        }
    }
}
/*
 * map_73848 -- one work-item per element of a flattened N x K x D x D index
 * space (the last index varies fastest).
 *
 * For the element (n, k, r, s) selected by the global id, sums, for
 * c = 0 .. D-1, the doubles stored in mem_81168 at element offset
 *   c * (D*D*K*N) + n * (D*D*K) + k * (D*D) + r * D + s
 * and writes the total to mem_81174 at element offset
 *   n * (D*D*K) + k * (D*D) + r * D + s.
 * Work-items whose indices fall outside N, K, D or D return without
 * touching memory.
 *
 * Buffers are raw bytes; element offsets are scaled by 8 and the address is
 * reinterpreted as a __global double.
 */
__kernel void map_73848(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *mem_81168,
                        __global unsigned char *mem_81174)
{
    const int32_t flat_id = get_global_id(0);

    /* Successive quotient/remainder steps, outermost index first. */
    const int32_t kdd_extent = K_68510 * D_68526 * D_68526;
    const int32_t dd_extent = D_68526 * D_68526;

    const int32_t n_idx = squot32(flat_id, kdd_extent);
    const int32_t within_n = flat_id - n_idx * kdd_extent;
    const int32_t k_idx = squot32(within_n, dd_extent);
    const int32_t within_k = within_n - k_idx * dd_extent;
    const int32_t r_idx = squot32(within_k, D_68526);
    const int32_t s_idx = within_k - r_idx * D_68526;

    if (!(((slt32(n_idx, N_68508) && slt32(k_idx, K_68510)) &&
           slt32(r_idx, D_68526)) && slt32(s_idx, D_68526))) {
        return;
    }

    /* Element offset of (n, k, r, s) inside one N x K x D x D block. */
    const int32_t cell = n_idx * kdd_extent + k_idx * dd_extent +
                         r_idx * D_68526 + s_idx;
    /* Distance, in elements, between consecutive blocks of mem_81168. */
    const int32_t block = kdd_extent * N_68508;

    /* Sequential left-to-right accumulation over the leading dimension. */
    double total = 0.0;
    for (int32_t c = 0; c < D_68526; c++) {
        const double term =
            *(__global double *) &mem_81168[(c * block + cell) * 8];
        total = total + term;
    }

    *(__global double *) &mem_81174[cell * 8] = total;
}
/*
 * map_73991 -- copies mem_81147 into mem_81154 with the two slowest-varying
 * (D x D) axes exchanged.
 *
 * Each physical work-group processes several virtual groups: it starts at
 * its own group id and advances by num_groups_77535 until
 * virt_groups_77542 is covered.  Within a virtual group every work-item
 * owns one (n, k, d) triple and moves the D x D values attached to it.
 */
__kernel void map_73991(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        int32_t num_groups_77535, int32_t virt_groups_77542,
                        __global unsigned char *mem_81147,
                        __global unsigned char *mem_81154)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_73971;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    const int32_t rounds = squot32(virt_groups_77542 - phys_group +
                                   num_groups_77535 - 1, num_groups_77535);
    const int32_t per_n = K_68510 * D_68526;

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_77535;
        const int32_t flat = virt_group * wg_size + lane;

        /* Unflatten into (n, k, d). */
        const int32_t n = squot32(flat, per_n);
        const int32_t left = flat - n * per_n;
        const int32_t k = squot32(left, D_68526);
        const int32_t d = left - k * D_68526;

        if (!((slt32(n, N_68508) && slt32(k, K_68510)) &&
              slt32(d, D_68526)))
            continue;

        const int32_t elem = n * (D_68526 * K_68510) + k * D_68526 + d;
        const int32_t inner = D_68526 * K_68510 * N_68508;
        const int32_t outer = D_68526 * K_68510 * N_68508 * D_68526;

        /* Destination (a, b) receives source (b, a). */
        for (int32_t a = 0; a < D_68526; a++) {
            for (int32_t b = 0; b < D_68526; b++) {
                *(__global double *)
                    &mem_81154[(elem + (a * outer + b * inner)) * 8] =
                    *(__global double *)
                        &mem_81147[(elem + (b * outer + a * inner)) * 8];
            }
        }
    }
}
/*
 * Private helpers for map_74126.  All three walk `count` doubles of a
 * byte-addressed buffer, starting at element `slot` and advancing `stride`
 * elements per step.
 */

/* Stores 0.0 into every visited element of buf. */
static inline void map_74126_clear(__global unsigned char *buf, int32_t slot,
                                   int32_t stride, int32_t count)
{
    for (int32_t i = 0; i < count; i++)
        *(__global double *) &buf[(slot + i * stride) * 8] = 0.0;
}

/* Stores `value` at step `hot` and 0.0 at every other step of buf. */
static inline void map_74126_one_hot(__global unsigned char *buf, int32_t slot,
                                     int32_t stride, int32_t count,
                                     int32_t hot, double value)
{
    for (int32_t i = 0; i < count; i++)
        *(__global double *) &buf[(slot + i * stride) * 8] =
            (i == hot) ? value : 0.0;
}

/* Copies each visited element of src to the same position in dst. */
static inline void map_74126_copy(__global unsigned char *dst,
                                  __global unsigned char *src, int32_t slot,
                                  int32_t stride, int32_t count)
{
    for (int32_t i = 0; i < count; i++)
        *(__global double *) &dst[(slot + i * stride) * 8] =
            *(__global double *) &src[(slot + i * stride) * 8];
}

/*
 * map_74126 -- for every (n, k, row, col) element, builds one vector of
 * D_68526 doubles and one of triD_68516 doubles and scatters them into
 * mem_81126 and mem_81133 respectively.
 *
 *   row <  col : both vectors are all zero.
 *   row == col : the D-vector holds prod * exp(qs) at position `row`; the
 *                triD-vector is zero.
 *   row >  col : the triD-vector holds prod at position
 *                col + row * (row - 1) / 2; the D-vector is zero.
 *
 * where prod = mem_81097[k, row, n] * mem_80889[n, k, col] and
 * qs = mem_80873[k, row, n].
 *
 * The vectors are staged in per-work-item scratch columns of the remaining
 * mem_* buffers; column step i sits at element slot + i * group size.  The
 * staging stores and copies are kept in their original order.
 *
 * Each physical work-group processes several virtual groups, striding by
 * num_groups_77470 until virt_groups_77477 is covered.
 */
__kernel void map_74126(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        int32_t D_68526, int32_t num_groups_77470,
                        int32_t virt_groups_77477,
                        __global unsigned char *mem_80873,
                        __global unsigned char *mem_80889,
                        __global unsigned char *mem_81097,
                        __global unsigned char *mem_81100,
                        __global unsigned char *mem_81103,
                        __global unsigned char *mem_81106,
                        __global unsigned char *mem_81109,
                        __global unsigned char *mem_81112,
                        __global unsigned char *mem_81115,
                        __global unsigned char *mem_81126,
                        __global unsigned char *mem_81133,
                        __global unsigned char *mem_81789,
                        __global unsigned char *mem_81792,
                        __global unsigned char *mem_81799,
                        __global unsigned char *mem_81802)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_74106;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    const int32_t rounds = squot32(virt_groups_77477 - phys_group +
                                   num_groups_77470 - 1, num_groups_77470);
    const int32_t per_n = K_68510 * D_68526 * D_68526;
    const int32_t per_k = D_68526 * D_68526;

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_77470;
        const int32_t flat = virt_group * wg_size + lane;

        /* Unflatten into (n, k, row, col). */
        const int32_t n = squot32(flat, per_n);
        const int32_t left_n = flat - n * per_n;
        const int32_t k = squot32(left_n, per_k);
        const int32_t left_k = left_n - k * per_k;
        const int32_t row = squot32(left_k, D_68526);
        const int32_t col = left_k - row * D_68526;

        if (!(((slt32(n, N_68508) && slt32(k, K_68510)) &&
               slt32(row, D_68526)) && slt32(col, D_68526)))
            continue;

        /* Inputs are loaded up front, whichever branch is taken below. */
        const int32_t krn = k * (N_68508 * D_68526) + row * N_68508 + n;
        const double lhs = *(__global double *) &mem_81097[krn * 8];
        const double qs = *(__global double *) &mem_80873[krn * 8];
        const double rhs =
            *(__global double *) &mem_80889[(n * (D_68526 * K_68510) +
                                             k * D_68526 + col) * 8];
        const double prod = lhs * rhs;

        /* First element of this work-item's scratch column in the D-sized
           and triD-sized staging buffers. */
        const int32_t slot_d = phys_group * (wg_size * D_68526) + lane;
        const int32_t slot_t = phys_group * (wg_size * triD_68516) + lane;

        if (slt32(row, col)) {
            map_74126_clear(mem_81100, slot_d, wg_size, D_68526);
            map_74126_clear(mem_81103, slot_t, wg_size, triD_68516);
            map_74126_copy(mem_81802, mem_81100, slot_d, wg_size, D_68526);
            map_74126_copy(mem_81799, mem_81103, slot_t, wg_size,
                           triD_68516);
        } else {
            if (row == col) {
                const double scaled = prod * futrts_exp64(qs);

                map_74126_one_hot(mem_81106, slot_d, wg_size, D_68526, row,
                                  scaled);
                map_74126_clear(mem_81109, slot_t, wg_size, triD_68516);
                map_74126_copy(mem_81792, mem_81106, slot_d, wg_size,
                               D_68526);
                map_74126_copy(mem_81789, mem_81109, slot_t, wg_size,
                               triD_68516);
            } else {
                const int32_t hot = col + sdiv32(row * (row - 1), 2);

                map_74126_one_hot(mem_81112, slot_t, wg_size, triD_68516,
                                  hot, prod);
                map_74126_clear(mem_81115, slot_d, wg_size, D_68526);
                map_74126_copy(mem_81792, mem_81115, slot_d, wg_size,
                               D_68526);
                map_74126_copy(mem_81789, mem_81112, slot_t, wg_size,
                               triD_68516);
            }
            map_74126_copy(mem_81802, mem_81792, slot_d, wg_size, D_68526);
            map_74126_copy(mem_81799, mem_81789, slot_t, wg_size,
                           triD_68516);
        }

        /* Scatter the staged vectors to the outputs; consecutive vector
           entries lie D*D*K*N elements apart. */
        const int32_t elem = n * (D_68526 * D_68526 * K_68510) +
                             k * (D_68526 * D_68526) + row * D_68526 + col;
        const int32_t plane = D_68526 * D_68526 * K_68510 * N_68508;

        for (int32_t i = 0; i < D_68526; i++)
            *(__global double *) &mem_81126[(elem + i * plane) * 8] =
                *(__global double *) &mem_81802[(slot_d + i * wg_size) * 8];

        for (int32_t i = 0; i < triD_68516; i++)
            *(__global double *) &mem_81133[(elem + i * plane) * 8] =
                *(__global double *) &mem_81799[(slot_t + i * wg_size) * 8];
    }
}
/*
 * map_74247 -- elementwise negation over an N x K x D extent: each in-range
 * work-item writes (0.0 - mem_81087[n, k, d]) to mem_81092[n, k, d].
 */
__kernel void map_74247(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *mem_81087,
                        __global unsigned char *mem_81092)
{
    const int32_t flat = get_global_id(0);

    /* Unflatten into (n, k, d). */
    const int32_t per_n = K_68510 * D_68526;
    const int32_t n = squot32(flat, per_n);
    const int32_t left = flat - n * per_n;
    const int32_t k = squot32(left, D_68526);
    const int32_t d = left - k * D_68526;

    if ((slt32(n, N_68508) && slt32(k, K_68510)) && slt32(d, D_68526)) {
        const int32_t elem = n * (D_68526 * K_68510) + k * D_68526 + d;
        const double value = *(__global double *) &mem_81087[elem * 8];

        *(__global double *) &mem_81092[elem * 8] = 0.0 - value;
    }
}
/*
 * map_74648 -- copies mem_80936 into mem_80942 with the two slowest-varying
 * (D x D) axes exchanged, one (n, k) pair per work-item.
 *
 * Each physical work-group processes several virtual groups, striding by
 * num_groups_77137 until virt_groups_77144 is covered.
 */
__kernel void map_74648(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        int32_t num_groups_77137, int32_t virt_groups_77144,
                        __global unsigned char *mem_80936,
                        __global unsigned char *mem_80942)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_74628;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    const int32_t rounds = squot32(virt_groups_77144 - phys_group +
                                   num_groups_77137 - 1, num_groups_77137);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_77137;
        const int32_t flat = virt_group * wg_size + lane;
        const int32_t n = squot32(flat, K_68510);
        const int32_t k = flat - n * K_68510;

        if (!(slt32(n, N_68508) && slt32(k, K_68510)))
            continue;

        const int32_t elem = n * K_68510 + k;
        const int32_t inner = K_68510 * N_68508;
        const int32_t outer = K_68510 * N_68508 * D_68526;

        /* Destination (a, b) receives source (b, a). */
        for (int32_t a = 0; a < D_68526; a++) {
            for (int32_t b = 0; b < D_68526; b++) {
                *(__global double *)
                    &mem_80942[(elem + (a * outer + b * inner)) * 8] =
                    *(__global double *)
                        &mem_80936[(elem + (b * outer + a * inner)) * 8];
            }
        }
    }
}
/*
 * map_74677 -- one work-item per (n, k, d).
 *
 * Accumulates sum_j mem_80889[n, k, j] * mem_80903[j, n, k, d] over
 * j < D_68526, multiplies the sum by mem_80897[k, n], doubles the product
 * (as r + r) and stores it in mem_80908[n, k, d].
 */
__kernel void map_74677(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *mem_80889,
                        __global unsigned char *mem_80897,
                        __global unsigned char *mem_80903,
                        __global unsigned char *mem_80908)
{
    const int32_t flat = get_global_id(0);

    /* Unflatten into (n, k, d). */
    const int32_t per_n = K_68510 * D_68526;
    const int32_t n = squot32(flat, per_n);
    const int32_t left = flat - n * per_n;
    const int32_t k = squot32(left, D_68526);
    const int32_t d = left - k * D_68526;

    if (!((slt32(n, N_68508) && slt32(k, K_68510)) && slt32(d, D_68526)))
        return;

    const double scale =
        *(__global double *) &mem_80897[(k * N_68508 + n) * 8];
    const int32_t nk = n * (D_68526 * K_68510) + k * D_68526;
    const int32_t elem = nk + d;
    const int32_t plane = D_68526 * K_68510 * N_68508;
    double dot = 0.0;

    for (int32_t j = 0; j < D_68526; j++) {
        const double lhs = *(__global double *) &mem_80889[(nk + j) * 8];
        const double rhs =
            *(__global double *) &mem_80903[(j * plane + elem) * 8];

        dot = dot + lhs * rhs;
    }

    const double scaled = scale * dot;

    *(__global double *) &mem_80908[elem * 8] = scaled + scaled;
}
/*
 * map_74751 -- one work-item per (n, k, d): multiplies mem_80919[n, k, d] by
 * mem_80923[k, n], doubles the product (as r + r) and stores it in
 * mem_80928[n, k, d].
 */
__kernel void map_74751(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *mem_80919,
                        __global unsigned char *mem_80923,
                        __global unsigned char *mem_80928)
{
    const int32_t flat = get_global_id(0);

    /* Unflatten into (n, k, d). */
    const int32_t per_n = K_68510 * D_68526;
    const int32_t n = squot32(flat, per_n);
    const int32_t left = flat - n * per_n;
    const int32_t k = squot32(left, D_68526);
    const int32_t d = left - k * D_68526;

    if ((slt32(n, N_68508) && slt32(k, K_68510)) && slt32(d, D_68526)) {
        const int32_t elem = n * (D_68526 * K_68510) + k * D_68526 + d;
        const double scale =
            *(__global double *) &mem_80923[(k * N_68508 + n) * 8];
        const double value = *(__global double *) &mem_80919[elem * 8];
        const double scaled = scale * value;

        *(__global double *) &mem_80928[elem * 8] = scaled + scaled;
    }
}
/*
 * map_74848 -- one work-item per (n, k): stores
 * 0.5 * (0.0 - mem_80863[n, k]) in mem_80893[n, k].
 */
__kernel void map_74848(int32_t N_68508, int32_t K_68510,
                        __global unsigned char *mem_80863,
                        __global unsigned char *mem_80893)
{
    const int32_t flat = get_global_id(0);
    const int32_t n = squot32(flat, K_68510);
    const int32_t k = flat - n * K_68510;

    if (slt32(n, N_68508) && slt32(k, K_68510)) {
        const int32_t elem = n * K_68510 + k;
        const double value = *(__global double *) &mem_80863[elem * 8];
        const double negated = 0.0 - value;

        *(__global double *) &mem_80893[elem * 8] = 0.5 * negated;
    }
}
/*
 * map_74932 -- one work-item per (n, k, d): stores
 * x_mem_80366[n, d] - mem_80884[n, k, d] in mem_80889[n, k, d].
 * Rows of x_mem_80366 are D_68509 elements apart.
 */
__kernel void map_74932(int32_t N_68508, int32_t D_68509, int32_t K_68510,
                        int32_t D_68526, __global unsigned char *x_mem_80366,
                        __global unsigned char *mem_80884,
                        __global unsigned char *mem_80889)
{
    const int32_t flat = get_global_id(0);

    /* Unflatten into (n, k, d). */
    const int32_t per_n = K_68510 * D_68526;
    const int32_t n = squot32(flat, per_n);
    const int32_t left = flat - n * per_n;
    const int32_t k = squot32(left, D_68526);
    const int32_t d = left - k * D_68526;

    if ((slt32(n, N_68508) && slt32(k, K_68510)) && slt32(d, D_68526)) {
        const int32_t elem = n * (D_68526 * K_68510) + k * D_68526 + d;
        const double x =
            *(__global double *) &x_mem_80366[(n * D_68509 + d) * 8];
        const double mean = *(__global double *) &mem_80884[elem * 8];

        *(__global double *) &mem_80889[elem * 8] = x - mean;
    }
}
/*
 * map_74990 -- one work-item per (n, k, row, col); fills mem_80879 with:
 *
 *   row <  col : 0.0
 *   row == col : exp(mem_80873[k, row, n])
 *   row >  col : icf_mem_80370[k, pos], where
 *                pos = D*(D-1)/2 - (D-col)*(D-col-1)/2 + (row - col - 1)
 *
 * Rows of icf_mem_80370 are triD_68516 elements apart.
 */
__kernel void map_74990(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        int32_t D_68526, __global unsigned char *icf_mem_80370,
                        __global unsigned char *mem_80873,
                        __global unsigned char *mem_80879)
{
    const int32_t flat = get_global_id(0);

    /* Unflatten into (n, k, row, col). */
    const int32_t per_n = K_68510 * D_68526 * D_68526;
    const int32_t per_k = D_68526 * D_68526;
    const int32_t n = squot32(flat, per_n);
    const int32_t left_n = flat - n * per_n;
    const int32_t k = squot32(left_n, per_k);
    const int32_t left_k = left_n - k * per_k;
    const int32_t row = squot32(left_k, D_68526);
    const int32_t col = left_k - row * D_68526;

    if (!(((slt32(n, N_68508) && slt32(k, K_68510)) &&
           slt32(row, D_68526)) && slt32(col, D_68526)))
        return;

    /* Loaded for every in-range element, although only the diagonal branch
       consumes it. */
    const double qs =
        *(__global double *) &mem_80873[(k * (N_68508 * D_68526) +
                                         row * N_68508 + n) * 8];
    double value;

    if (slt32(row, col)) {
        value = 0.0;
    } else if (row == col) {
        value = futrts_exp64(qs);
    } else {
        const int32_t all_pairs = sdiv32(D_68526 * (D_68526 - 1), 2);
        const int32_t tail = D_68526 - col;
        const int32_t tail_pairs = sdiv32(tail * (tail - 1), 2);
        const int32_t pos = (all_pairs - tail_pairs) + ((row - col) - 1);

        value = *(__global double *) &icf_mem_80370[(k * triD_68516 + pos) *
                                                    8];
    }

    *(__global double *) &mem_80879[(n * (D_68526 * D_68526 * K_68510) +
                                     k * (D_68526 * D_68526) +
                                     row * D_68526 + col) * 8] = value;
}
/*
 * map_75119 -- one (n, k) pair per work-item.  For each in-range pair:
 *
 *   - computes mem_80652[n] * exp(mem_80649[n, k]);
 *   - copies column k of mem_80635 (rows K_68511 elements apart) into
 *     mem_80854[i, n, k] for i < D_68526;
 *   - copies column k of mem_80631 (rows K_68513 elements apart) into
 *     mem_80859[i, n, k] for i < D_68526;
 *   - stores the product from the first step in mem_80863[n, k].
 *
 * Each physical work-group processes several virtual groups, striding by
 * num_groups_76916 until virt_groups_76923 is covered.
 */
__kernel void map_75119(int32_t N_68508, int32_t K_68510, int32_t K_68511,
                        int32_t K_68513, int32_t D_68526,
                        int32_t num_groups_76916, int32_t virt_groups_76923,
                        __global unsigned char *mem_80631,
                        __global unsigned char *mem_80635,
                        __global unsigned char *mem_80649,
                        __global unsigned char *mem_80652,
                        __global unsigned char *mem_80854,
                        __global unsigned char *mem_80859,
                        __global unsigned char *mem_80863)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_75099;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    const int32_t rounds = squot32(virt_groups_76923 - phys_group +
                                   num_groups_76916 - 1, num_groups_76916);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_76916;
        const int32_t flat = virt_group * wg_size + lane;
        const int32_t n = squot32(flat, K_68510);
        const int32_t k = flat - n * K_68510;

        if (!(slt32(n, N_68508) && slt32(k, K_68510)))
            continue;

        const int32_t elem = n * K_68510 + k;
        const int32_t plane = K_68510 * N_68508;

        /* The product is computed before the copies and stored after them,
           as in the generated ordering. */
        const double factor = *(__global double *) &mem_80652[n * 8];
        const double exponent = *(__global double *) &mem_80649[elem * 8];
        const double weighted = factor * futrts_exp64(exponent);

        for (int32_t i = 0; i < D_68526; i++)
            *(__global double *) &mem_80854[(elem + i * plane) * 8] =
                *(__global double *) &mem_80635[(k + i * K_68511) * 8];

        for (int32_t i = 0; i < D_68526; i++)
            *(__global double *) &mem_80859[(elem + i * plane) * 8] =
                *(__global double *) &mem_80631[(k + i * K_68513) * 8];

        *(__global double *) &mem_80863[elem * 8] = weighted;
    }
}
/*
 * map_76183 -- one index per work-item: stores
 * d_r_68524 * (1.0 / mem_80645[i]) in mem_80652[i] for every i < N_68508.
 *
 * Each physical work-group processes several virtual groups, striding by
 * num_groups_76435 until virt_groups_76442 is covered.
 */
__kernel void map_76183(int32_t N_68508, double d_r_68524,
                        int32_t num_groups_76435, int32_t virt_groups_76442,
                        __global unsigned char *mem_80645,
                        __global unsigned char *mem_80652)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_76163;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    const int32_t rounds = squot32(virt_groups_76442 - phys_group +
                                   num_groups_76435 - 1, num_groups_76435);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_76435;
        const int32_t i = virt_group * wg_size + lane;

        if (slt32(i, N_68508)) {
            const double value = *(__global double *) &mem_80645[i * 8];
            const double reciprocal = 1.0 / value;

            *(__global double *) &mem_80652[i * 8] = d_r_68524 * reciprocal;
        }
    }
}
/*
 * map_78058 -- one index k < K_68510 per work-item.  For each k:
 *
 *   - for every i < D_68526, sums mem_81439[j, i, k] over j < N_68508 and
 *     stages the sum in this work-item's scratch column of mem_81442
 *     (step i sits at element slot + i * group size);
 *   - stores exp(alphas_mem_80367[k]) in mem_81445[k];
 *   - copies the staged sums to mem_81449[i, k].
 *
 * Each physical work-group processes several virtual groups, striding by
 * num_groups_78392 until virt_groups_78399 is covered.
 */
__kernel void map_78058(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        int32_t num_groups_78392, int32_t virt_groups_78399,
                        __global unsigned char *alphas_mem_80367,
                        __global unsigned char *mem_81439,
                        __global unsigned char *mem_81442,
                        __global unsigned char *mem_81445,
                        __global unsigned char *mem_81449)
{
    const int32_t wg_size = rev_gmm_objectivezigroup_sizze_78038;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    const int32_t rounds = squot32(virt_groups_78399 - phys_group +
                                   num_groups_78392 - 1, num_groups_78392);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_78392;
        const int32_t k = virt_group * wg_size + lane;

        if (!slt32(k, K_68510))
            continue;

        const double alpha = *(__global double *) &alphas_mem_80367[k * 8];
        const int32_t slot = phys_group * (wg_size * D_68526) + lane;

        for (int32_t i = 0; i < D_68526; i++) {
            double sum = 0.0;

            for (int32_t j = 0; j < N_68508; j++) {
                const double term =
                    *(__global double *) &mem_81439[(j * (K_68510 * D_68526) +
                                                     i * K_68510 + k) * 8];

                sum = sum + term;
            }
            *(__global double *) &mem_81442[(slot + i * wg_size) * 8] = sum;
        }

        *(__global double *) &mem_81445[k * 8] = futrts_exp64(alpha);

        for (int32_t i = 0; i < D_68526; i++)
            *(__global double *) &mem_81449[(k + i * K_68510) * 8] =
                *(__global double *) &mem_81442[(slot + i * wg_size) * 8];
    }
}
/*
 * map_78148
 *
 * One work-item per index k: for k < K_68510, writes
 * futrts_exp64(alphas_mem_80367[k]) to mem_81493[k]. Both buffers are byte
 * pointers holding 8-byte doubles.
 */
__kernel void map_78148(int32_t K_68510,
                        __global unsigned char *alphas_mem_80367,
                        __global unsigned char *mem_81493)
{
    const int32_t k = get_global_id(0);

    /* Work-items past the end of the array do nothing. */
    if (!slt32(k, K_68510)) {
        return;
    }

    const double alpha = *(__global double *) &alphas_mem_80367[k * 8];
    const double weight = futrts_exp64(alpha);

    *(__global double *) &mem_81493[k * 8] = weight;
}
/*
 * map_78171
 *
 * The flat global id is split into (k, d) = (id / D_68526, id % D_68526).
 * For k < K_68510 and d < D_68526 the work-item sums the N_68508 doubles
 * found in res_mem_81412 at element n * (D * K) + k * D + d and stores the
 * total in mem_81468 at element k * D + d.
 */
__kernel void map_78171(int32_t N_68508, int32_t K_68510, int32_t D_68526,
                        __global unsigned char *res_mem_81412,
                        __global unsigned char *mem_81468)
{
    const int32_t flat = get_global_id(0);
    const int32_t k = squot32(flat, D_68526);
    const int32_t d = flat - squot32(flat, D_68526) * D_68526;

    if (!(slt32(k, K_68510) && slt32(d, D_68526))) {
        return;
    }

    const int32_t plane = D_68526 * K_68510;
    double total = 0.0;

    for (int32_t n = 0; n < N_68508; n++) {
        total += *(__global double *) &res_mem_81412[(n * plane +
                                                      k * D_68526 +
                                                      d) * 8];
    }

    *(__global double *) &mem_81468[(k * D_68526 + d) * 8] = total;
}
/*
 * map_78623
 *
 * Work is distributed over "virtual" work-groups: each physical group serves
 * virtual groups phys, phys + num_groups, ... below virt_groups_78707.
 * For every index k < K_68510 the owning work-item produces three results:
 *
 *   mem_81536[k]          = sum_n res_mem_81411[n * K + k]
 *                           + res_68862 * exp(alphas[k])
 *   mem_81533[d * K + k]  = sum_n mem_81510[n * (K * D) + d * K + k]
 *                           + (res_68867 + e * (t * e + t * e)),
 *                           e = exp(mem_81505[d * K_68513 + k]),
 *                           t = t1389_68865, for d in [0, D_68526)
 *   mem_81529[j * K + k]  = sum_n mem_81519[n * (K * triD) + j * K + k]
 *                           + (t * c + t * c),
 *                           c = mem_81514[j * K_68515 + k],
 *                           for j in [0, triD_68516)
 *
 * The per-d and per-j values are first parked in this work-item's slots of
 * mem_81522 / mem_81525 and copied out afterwards. All buffers are byte
 * pointers holding 8-byte doubles at explicit offsets.
 */
__kernel void map_78623(int32_t N_68508, int32_t K_68510, int32_t K_68513,
                        int32_t K_68515, int32_t triD_68516, int32_t D_68526,
                        double res_68862, double t1389_68865, double res_68867,
                        int32_t num_groups_78700, int32_t virt_groups_78707,
                        __global unsigned char *alphas_mem_80367,
                        __global unsigned char *res_mem_81411,
                        __global unsigned char *mem_81505,
                        __global unsigned char *mem_81510,
                        __global unsigned char *mem_81514,
                        __global unsigned char *mem_81519,
                        __global unsigned char *mem_81522,
                        __global unsigned char *mem_81525,
                        __global unsigned char *mem_81529,
                        __global unsigned char *mem_81533,
                        __global unsigned char *mem_81536)
{
    const int32_t tile_width = rev_gmm_objectivezigroup_sizze_78603;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    /* How many virtual groups this physical group has to serve. */
    const int32_t rounds = squot32(virt_groups_78707 - phys_group +
                                   num_groups_78700 - 1, num_groups_78700);
    /* First staging slots of this work-item in mem_81522 and mem_81525. */
    const int32_t d_stage = phys_group * (tile_width * D_68526) + lane;
    const int32_t tri_stage = phys_group * (tile_width * triD_68516) + lane;
    const int32_t d_plane = K_68510 * D_68526;
    const int32_t tri_plane = K_68510 * triD_68516;

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_78700;
        const int32_t k = virt_group * tile_width + lane;

        /* Out-of-range work-items have nothing to do in this round. */
        if (!slt32(k, K_68510)) {
            continue;
        }

        /* --- scalar result for index k ------------------------------- */
        const double alpha = *(__global double *) &alphas_mem_80367[k * 8];
        double column_sum = 0.0;

        for (int32_t n = 0; n < N_68508; n++) {
            column_sum += *(__global double *) &res_mem_81411[(n * K_68510 +
                                                               k) * 8];
        }

        const double weight = futrts_exp64(alpha);
        const double scaled_weight = res_68862 * weight;
        const double alpha_out = column_sum + scaled_weight;

        /* --- D-sized result, staged in mem_81522 ---------------------- */
        for (int32_t d = 0; d < D_68526; d++) {
            const double q = *(__global double *) &mem_81505[(d * K_68513 +
                                                              k) * 8];
            double acc = 0.0;

            for (int32_t n = 0; n < N_68508; n++) {
                acc += *(__global double *) &mem_81510[(n * d_plane +
                                                        d * K_68510 +
                                                        k) * 8];
            }

            const double e = futrts_exp64(q);
            const double scaled = t1389_68865 * e;
            const double doubled = scaled + scaled;
            const double product = e * doubled;
            const double shifted = res_68867 + product;
            const double q_out = acc + shifted;

            *(__global double *) &mem_81522[(d_stage + d * tile_width) * 8] =
                q_out;
        }

        /* --- triD-sized result, staged in mem_81525 ------------------- */
        for (int32_t j = 0; j < triD_68516; j++) {
            const double c = *(__global double *) &mem_81514[(j * K_68515 +
                                                              k) * 8];
            double acc = 0.0;

            for (int32_t n = 0; n < N_68508; n++) {
                acc += *(__global double *) &mem_81519[(n * tri_plane +
                                                        j * K_68510 +
                                                        k) * 8];
            }

            const double scaled = t1389_68865 * c;
            const double doubled = scaled + scaled;
            const double c_out = acc + doubled;

            *(__global double *) &mem_81525[(tri_stage + j * tile_width) *
                                            8] = c_out;
        }

        /* --- copy staged values to their final buffers ---------------- */
        for (int32_t j = 0; j < triD_68516; j++) {
            *(__global double *) &mem_81529[(k + j * K_68510) * 8] =
                *(__global double *) &mem_81525[(tri_stage +
                                                 j * tile_width) * 8];
        }
        for (int32_t d = 0; d < D_68526; d++) {
            *(__global double *) &mem_81533[(k + d * K_68510) * 8] =
                *(__global double *) &mem_81522[(d_stage +
                                                 d * tile_width) * 8];
        }
        *(__global double *) &mem_81536[k * 8] = alpha_out;
    }
}
/*
 * map_78863
 *
 * The flat global id is split into (k, j) = (id / triD_68516, id % triD_68516).
 * For k < K_68510 and j < triD_68516 the work-item writes, at element
 * k * triD + j of mem_81614:
 *
 *   sum_n res_mem_81414[n * (triD * K) + k * triD + j] + (t * c + t * c)
 *
 * where c = icf_mem_80370[k * triD + j] and t = t1389_68865.
 */
__kernel void map_78863(int32_t N_68508, int32_t K_68510, int32_t triD_68516,
                        double t1389_68865,
                        __global unsigned char *icf_mem_80370,
                        __global unsigned char *res_mem_81414,
                        __global unsigned char *mem_81614)
{
    const int32_t flat = get_global_id(0);
    const int32_t k = squot32(flat, triD_68516);
    const int32_t j = flat - squot32(flat, triD_68516) * triD_68516;

    if (!(slt32(k, K_68510) && slt32(j, triD_68516))) {
        return;
    }

    const double c = *(__global double *) &icf_mem_80370[(k * triD_68516 +
                                                          j) * 8];
    const int32_t plane = triD_68516 * K_68510;
    double acc = 0.0;

    for (int32_t n = 0; n < N_68508; n++) {
        acc += *(__global double *) &res_mem_81414[(n * plane +
                                                    k * triD_68516 +
                                                    j) * 8];
    }

    const double scaled = t1389_68865 * c;
    const double doubled = scaled + scaled;
    const double result = acc + doubled;

    *(__global double *) &mem_81614[(k * triD_68516 + j) * 8] = result;
}
/*
 * map_78926
 *
 * The flat global id is split into (k, j) = (id / triD_68516, id % triD_68516).
 * For k < K_68510 and j < triD_68516 the work-item writes, at element
 * k * triD + j of mem_81642:
 *
 *   mem_81638[k * triD + j] + (t * c + t * c)
 *
 * where c = icf_mem_80370[k * triD + j] and t = t1389_68865.
 */
__kernel void map_78926(int32_t K_68510, int32_t triD_68516,
                        double t1389_68865,
                        __global unsigned char *icf_mem_80370,
                        __global unsigned char *mem_81638,
                        __global unsigned char *mem_81642)
{
    const int32_t flat = get_global_id(0);
    const int32_t k = squot32(flat, triD_68516);
    const int32_t j = flat - squot32(flat, triD_68516) * triD_68516;

    if (!(slt32(k, K_68510) && slt32(j, triD_68516))) {
        return;
    }

    const double c = *(__global double *) &icf_mem_80370[(k * triD_68516 +
                                                          j) * 8];
    const double base = *(__global double *) &mem_81638[(k * triD_68516 +
                                                         j) * 8];
    const double scaled = t1389_68865 * c;
    const double doubled = scaled + scaled;
    const double result = base + doubled;

    *(__global double *) &mem_81642[(k * triD_68516 + j) * 8] = result;
}
/*
 * map_79020
 *
 * The flat global id is split into (k, d) = (id / D_68526, id % D_68526).
 * For k < K_68510 and d < D_68526 the work-item writes, at element
 * k * D + d of mem_81580:
 *
 *   sum_n res_mem_81413[n * (D * K) + k * D + d]
 *     + (res_68867 + e * (t * e + t * e))
 *
 * where e = futrts_exp64(qs_mem_80369[k * D_68514 + d]) and t = t1389_68865.
 */
__kernel void map_79020(int32_t N_68508, int32_t K_68510, int32_t D_68514,
                        int32_t D_68526, double t1389_68865, double res_68867,
                        __global unsigned char *qs_mem_80369,
                        __global unsigned char *res_mem_81413,
                        __global unsigned char *mem_81580)
{
    const int32_t flat = get_global_id(0);
    const int32_t k = squot32(flat, D_68526);
    const int32_t d = flat - squot32(flat, D_68526) * D_68526;

    if (!(slt32(k, K_68510) && slt32(d, D_68526))) {
        return;
    }

    /* Note the row stride of qs_mem is D_68514, not D_68526. */
    const double q = *(__global double *) &qs_mem_80369[(k * D_68514 + d) *
                                                        8];
    const int32_t plane = D_68526 * K_68510;
    double acc = 0.0;

    for (int32_t n = 0; n < N_68508; n++) {
        acc += *(__global double *) &res_mem_81413[(n * plane +
                                                    k * D_68526 +
                                                    d) * 8];
    }

    const double e = futrts_exp64(q);
    const double scaled = t1389_68865 * e;
    const double doubled = scaled + scaled;
    const double product = e * doubled;
    const double shifted = res_68867 + product;
    const double result = acc + shifted;

    *(__global double *) &mem_81580[(k * D_68526 + d) * 8] = result;
}
/*
 * map_79093
 *
 * The flat global id is split into (k, d) = (id / D_68526, id % D_68526).
 * For k < K_68510 and d < D_68526 the work-item writes, at element
 * k * D + d of mem_81608:
 *
 *   mem_81604[k * D + d] + (res_68867 + e * (t * e + t * e))
 *
 * where e = futrts_exp64(qs_mem_80369[k * D_68514 + d]) and t = t1389_68865.
 */
__kernel void map_79093(int32_t K_68510, int32_t D_68514, int32_t D_68526,
                        double t1389_68865, double res_68867,
                        __global unsigned char *qs_mem_80369,
                        __global unsigned char *mem_81604,
                        __global unsigned char *mem_81608)
{
    const int32_t flat = get_global_id(0);
    const int32_t k = squot32(flat, D_68526);
    const int32_t d = flat - squot32(flat, D_68526) * D_68526;

    if (!(slt32(k, K_68510) && slt32(d, D_68526))) {
        return;
    }

    /* Note the row stride of qs_mem is D_68514, not D_68526. */
    const double q = *(__global double *) &qs_mem_80369[(k * D_68514 + d) *
                                                        8];
    const double base = *(__global double *) &mem_81604[(k * D_68526 + d) *
                                                        8];
    const double e = futrts_exp64(q);
    const double scaled = t1389_68865 * e;
    const double doubled = scaled + scaled;
    const double product = e * doubled;
    const double shifted = res_68867 + product;
    const double result = base + shifted;

    *(__global double *) &mem_81608[(k * D_68526 + d) * 8] = result;
}
/*
 * map_79201
 *
 * Work is distributed over "virtual" work-groups: each physical group serves
 * virtual groups phys, phys + num_groups, ... below virt_groups_79278.
 * For every index k < K_68510 the owning work-item writes
 *
 *   mem_81576[k] = mem_81573[k] + res_68862 * futrts_exp64(alphas[k])
 *
 * All buffers are byte pointers holding 8-byte doubles.
 */
__kernel void map_79201(int32_t K_68510, double res_68862,
                        int32_t num_groups_79271, int32_t virt_groups_79278,
                        __global unsigned char *alphas_mem_80367,
                        __global unsigned char *mem_81573,
                        __global unsigned char *mem_81576)
{
    const int32_t tile_width = rev_gmm_objectivezigroup_sizze_79181;
    const int32_t lane = get_local_id(0);
    const int32_t phys_group = get_group_id(0);
    /* How many virtual groups this physical group has to serve. */
    const int32_t rounds = squot32(virt_groups_79278 - phys_group +
                                   num_groups_79271 - 1, num_groups_79271);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t virt_group = phys_group + round * num_groups_79271;
        const int32_t k = virt_group * tile_width + lane;

        /* Out-of-range work-items have nothing to do in this round. */
        if (!slt32(k, K_68510)) {
            continue;
        }

        const double alpha = *(__global double *) &alphas_mem_80367[k * 8];
        const double base = *(__global double *) &mem_81573[k * 8];
        const double weight = futrts_exp64(alpha);
        const double scaled_weight = res_68862 * weight;
        const double result = base + scaled_weight;

        *(__global double *) &mem_81576[k * 8] = result;
    }
}
/*
 * map_intra_group_69162
 *
 * Each work-item derives a pair (gtid_69122, ltid_69123) from its global id
 * by dividing by K_68318. In-range work-items compute one double
 * (res_70076); the work-group then adds those values together through the
 * __local buffer mem_80411, and work-item 0 stores futrts_log64 of the sum
 * into mem_80414 at the group's index.
 *
 * All __global buffers are byte pointers; elements are accessed as 8-byte
 * doubles at explicit byte offsets. mem_80408 is used as per-work-item
 * scratch (written, then read back by the same work-item).
 *
 * NOTE(review): the scratch/local indexing mixes ltid_69123 and
 * local_tid_69163; this presumably relies on the launch using a work-group
 * size equal to K_68318 -- confirm against the host code.
 */
__kernel void map_intra_group_69162(__local volatile
                                    int64_t *mem_80411_backing_aligned_0,
                                    int32_t N_68316, int32_t D_68317,
                                    int32_t K_68318, int32_t K_68319,
                                    int32_t K_68321, int32_t K_68323,
                                    int32_t D_68333, __global
                                    unsigned char *x_mem_80366, __global
                                    unsigned char *alphas_mem_80367, __global
                                    unsigned char *mem_80397, __global
                                    unsigned char *mem_80401, __global
                                    unsigned char *mem_80405, __global
                                    unsigned char *mem_80408, __global
                                    unsigned char *mem_80414)
{
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_80411_backing_0 =
                          mem_80411_backing_aligned_0;
    int32_t global_tid_69162;
    int32_t local_tid_69163;
    int32_t group_sizze_82046;
    int32_t wave_sizze_82045;
    int32_t group_id_69164;
    global_tid_69162 = get_global_id(0);
    local_tid_69163 = get_local_id(0);
    group_sizze_82046 = get_local_size(0);
    wave_sizze_82045 = LOCKSTEP_WIDTH;
    group_id_69164 = get_group_id(0);
    int32_t gtid_69122;
    int32_t ltid_69123;
    // Split the flat global id into quotient and remainder by K_68318.
    gtid_69122 = squot32(global_tid_69162, K_68318);
    ltid_69123 = global_tid_69162 - squot32(global_tid_69162, K_68318) *
        K_68318;
    double x_79555;
    double x_79559;
    double x_70011;
    double x_79563;
    double y_70074;
    double res_70075;
    double res_70076;
    if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {
        x_79555 = *(__global double *) &alphas_mem_80367[ltid_69123 * 8];
        // Sum of the D_68333 entries of mem_80397 belonging to ltid.
        double x_70000 = 0.0;
        for (int32_t chunk_offset_69999 = 0; chunk_offset_69999 < D_68333;
             chunk_offset_69999++) {
            double x_70007;
            double res_70010;
            x_70007 = *(__global double *) &mem_80397[(chunk_offset_69999 *
                                                       K_68321 + ltid_69123) *
                                                      8];
            res_70010 = x_70000 + x_70007;
            double x_tmp_82047 = res_70010;
            x_70000 = x_tmp_82047;
        }
        x_79559 = x_70000;
        x_70011 = x_79555 + x_79559;
        // Element-wise difference x[gtid, i] - mem_80401[i, ltid], parked in
        // this work-item's scratch slots of mem_80408.
        for (int32_t i_70016 = 0; i_70016 < D_68333; i_70016++) {
            double x_elem_elem_70017;
            double means_elem_elem_70018;
            double res_70019;
            x_elem_elem_70017 = *(__global double *) &x_mem_80366[(gtid_69122 *
                                                                   D_68317 +
                                                                   i_70016) *
                                                                  8];
            means_elem_elem_70018 = *(__global double *) &mem_80401[(i_70016 *
                                                                     K_68319 +
                                                                     ltid_69123) *
                                                                    8];
            res_70019 = x_elem_elem_70017 - means_elem_elem_70018;
            *(__global double *) &mem_80408[(group_id_69164 * (K_68318 *
                                                               D_68333) +
                                                 local_tid_69163 + i_70016 *
                                                 K_68318) * 8] = res_70019;
        }
        // Outer loop (row index chunk_offset_70023): accumulates the squares
        // of the per-row dot products computed by the inner loop.
        double x_70024 = 0.0;
        for (int32_t chunk_offset_70023 = 0; chunk_offset_70023 < D_68333;
             chunk_offset_70023++) {
            double qs_elem_elem_70034;
            double res_70036;
            double res_70071;
            double res_70073;
            qs_elem_elem_70034 = *(__global
                                   double *) &mem_80397[(chunk_offset_70023 *
                                                         K_68321 + ltid_69123) *
                                                        8];
            // Inner loop (column index chunk_offset_70038): dot product of
            // the scratch differences with a coefficient that is 0 when
            // row < column, futrts_exp64 of the mem_80397 entry when
            // row == column, and an entry of mem_80405 otherwise.
            double x_70039 = 0.0;
            for (int32_t chunk_offset_70038 = 0; chunk_offset_70038 < D_68333;
                 chunk_offset_70038++) {
                double x_70049;
                bool cond_70051;
                double res_70052;
                double res_70068;
                double res_70070;
                x_70049 = *(__global double *) &mem_80408[(group_id_69164 *
                                                           (K_68318 * D_68333) +
                                                           local_tid_69163 +
                                                           chunk_offset_70038 *
                                                           K_68318) * 8];
                cond_70051 = slt32(chunk_offset_70023, chunk_offset_70038);
                if (cond_70051) {
                    res_70052 = 0.0;
                } else {
                    bool cond_70053;
                    double res_70054;
                    cond_70053 = chunk_offset_70023 == chunk_offset_70038;
                    if (cond_70053) {
                        double res_70055;
                        res_70055 = futrts_exp64(qs_elem_elem_70034);
                        res_70054 = res_70055;
                    } else {
                        int32_t y_70056;
                        int32_t x_70057;
                        int32_t res_70058;
                        int32_t gmm_knossos_tri_arg_70059;
                        int32_t y_70060;
                        int32_t x_70061;
                        int32_t res_70062;
                        int32_t x_70063;
                        int32_t x_70064;
                        int32_t y_70065;
                        int32_t i_70066;
                        double res_70067;
                        // With r = row and c = column, the index below is
                        //   sdiv32(D*(D-1), 2) - sdiv32((D-c)*(D-c-1), 2)
                        //     + (r - c - 1)
                        // NOTE(review): presumably a packed strictly-lower-
                        // triangular layout of mem_80405 -- confirm.
                        y_70056 = D_68333 - 1;
                        x_70057 = D_68333 * y_70056;
                        res_70058 = sdiv32(x_70057, 2);
                        gmm_knossos_tri_arg_70059 = D_68333 -
                            chunk_offset_70038;
                        y_70060 = gmm_knossos_tri_arg_70059 - 1;
                        x_70061 = gmm_knossos_tri_arg_70059 * y_70060;
                        res_70062 = sdiv32(x_70061, 2);
                        x_70063 = res_70058 - res_70062;
                        x_70064 = chunk_offset_70023 - chunk_offset_70038;
                        y_70065 = x_70064 - 1;
                        i_70066 = x_70063 + y_70065;
                        res_70067 = *(__global double *) &mem_80405[(i_70066 *
                                                                     K_68323 +
                                                                     ltid_69123) *
                                                                    8];
                        res_70054 = res_70067;
                    }
                    res_70052 = res_70054;
                }
                res_70068 = x_70049 * res_70052;
                res_70070 = x_70039 + res_70068;
                double x_tmp_82050 = res_70070;
                x_70039 = x_tmp_82050;
            }
            res_70036 = x_70039;
            res_70071 = res_70036 * res_70036;
            res_70073 = x_70024 + res_70071;
            double x_tmp_82049 = res_70073;
            x_70024 = x_tmp_82049;
        }
        x_79563 = x_70024;
        // Per-work-item value: exp((alpha + sum) - 0.5 * sum of squares).
        y_70074 = 0.5 * x_79563;
        res_70075 = x_70011 - y_70074;
        res_70076 = futrts_exp64(res_70075);
    }
    __local char *mem_80411;
    double res_70077;
    mem_80411 = (__local char *) mem_80411_backing_0;
    // Publish each work-item's value to local memory (single iteration).
    // NOTE(review): res_70076 is only assigned inside the guarded branch
    // above, while this store is guarded by the local id alone -- confirm
    // the launch configuration never leaves it unassigned here.
    for (int32_t comb_iter_82051 = 0; comb_iter_82051 < 1; comb_iter_82051++) {
        int32_t ctid_69160;
        int32_t flat_comb_id_82052 = comb_iter_82051 * K_68318 +
                local_tid_69163;
        ctid_69160 = flat_comb_id_82052;
        if (slt32(ctid_69160, K_68318) && 1) {
            *(__local double *) &mem_80411[ctid_69160 * 8] = res_70076;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Work-group sum of the first K_68318 doubles in mem_80411; the total
    // ends up in element 0.
    int32_t offset_82053;
    int32_t skip_waves_82054;
    double x_70078;
    double x_70079;
    offset_82053 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_69163, K_68318)) {
            x_70078 = *(__local double *) &mem_80411[(local_tid_69163 +
                                                      offset_82053) * 8];
        }
    }
    offset_82053 = 1;
    // Phase 1: pairwise combination inside each run of wave_sizze_82045
    // consecutive local ids, with the offset doubling each step. No barrier
    // is issued inside this loop; the local-memory accesses go through
    // volatile pointers instead.
    while (slt32(offset_82053, wave_sizze_82045)) {
        if (slt32(local_tid_69163 + offset_82053, K_68318) &&
            ((local_tid_69163 - squot32(local_tid_69163, wave_sizze_82045) *
              wave_sizze_82045) & (2 * offset_82053 - 1)) == 0) {
            // read array element
            {
                x_70079 = *(volatile __local
                            double *) &mem_80411[(local_tid_69163 +
                                                  offset_82053) * 8];
            }
            // apply reduction operation
            {
                double res_70080;
                // NOTE(review): res_70080 stays unassigned when this guard
                // fails, yet it is copied into the accumulator below.
                if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {
                    res_70080 = x_70078 + x_70079;
                }
                x_70078 = res_70080;
            }
            // write result of operation
            {
                *(volatile __local double *) &mem_80411[local_tid_69163 * 8] =
                    x_70078;
            }
        }
        offset_82053 *= 2;
    }
    skip_waves_82054 = 1;
    // Phase 2: combine the per-wave partial sums. Only the first work-item
    // of a wave takes part, and every step starts with a barrier.
    while (slt32(skip_waves_82054, squot32(K_68318 + wave_sizze_82045 - 1,
                                           wave_sizze_82045))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_82053 = skip_waves_82054 * wave_sizze_82045;
        if (slt32(local_tid_69163 + offset_82053, K_68318) &&
            ((local_tid_69163 - squot32(local_tid_69163, wave_sizze_82045) *
              wave_sizze_82045) == 0 && (squot32(local_tid_69163,
                                                 wave_sizze_82045) & (2 *
                                                                      skip_waves_82054 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_70079 = *(__local double *) &mem_80411[(local_tid_69163 +
                                                          offset_82053) * 8];
            }
            // apply reduction operation
            {
                double res_70080;
                if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {
                    res_70080 = x_70078 + x_70079;
                }
                x_70078 = res_70080;
            }
            // write result of operation
            {
                *(__local double *) &mem_80411[local_tid_69163 * 8] = x_70078;
            }
        }
        skip_waves_82054 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Every work-item reads the group total from element 0.
    res_70077 = *(__local double *) &mem_80411[0];
    double res_70081;
    if (slt32(gtid_69122, N_68316) && slt32(ltid_69123, K_68318)) {
        res_70081 = futrts_log64(res_70077);
    }
    // Work-item 0 publishes one result per work-group.
    if (local_tid_69163 == 0) {
        *(__global double *) &mem_80414[group_id_69164 * 8] = res_70081;
    }
}
/*
 * map_intra_group_70441
 *
 * Each work-item derives (gtid_70430, ltid_70431) from its global id by
 * dividing by computed_group_sizze_70945. The work-group performs three
 * sum reductions through separate __local buffers:
 *
 *   res_71035 = sum over D_68333 entries of futrts_exp64(q)^2  (mem_80471)
 *   res_71042 = sum over triD_68324 entries of icf^2           (mem_80474)
 *   res_71051 = sum over D_68333 entries of q                  (mem_80477)
 *
 * where q is read from qs_mem_80369 and icf from icf_mem_80370. Work-item 0
 * then stores
 *
 *   0.5 * (x_68452 * (res_71035 + res_71042))
 *     - res_68453 * res_71051 - y_68475
 *
 * into mem_80480 at the group's index. __global buffers are byte pointers
 * holding 8-byte doubles at explicit byte offsets.
 *
 * NOTE(review): each staging loop stores at ctid (which advances with the
 * loop iteration) but loads from global memory at ltid_70431 (which does
 * not). If a staging loop ever ran more than one iteration, every iteration
 * would load the same element -- presumably computed_group_sizze_70945 is
 * always >= D_68333 and triD_68324; confirm against the host code.
 */
__kernel void map_intra_group_70441(__local volatile
                                    int64_t *mem_80471_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_80474_backing_aligned_1,
                                    __local volatile
                                    int64_t *mem_80477_backing_aligned_2,
                                    int32_t K_68318, int32_t D_68322,
                                    int32_t triD_68324, int32_t D_68333,
                                    double x_68452, double res_68453,
                                    double y_68475,
                                    int32_t computed_group_sizze_70945, __global
                                    unsigned char *qs_mem_80369, __global
                                    unsigned char *icf_mem_80370, __global
                                    unsigned char *mem_80480)
{
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_80471_backing_0 =
                          mem_80471_backing_aligned_0;
    __local volatile char *restrict mem_80474_backing_1 =
                          mem_80474_backing_aligned_1;
    __local volatile char *restrict mem_80477_backing_2 =
                          mem_80477_backing_aligned_2;
    int32_t global_tid_70441;
    int32_t local_tid_70442;
    int32_t group_sizze_82203;
    int32_t wave_sizze_82202;
    int32_t group_id_70443;
    global_tid_70441 = get_global_id(0);
    local_tid_70442 = get_local_id(0);
    group_sizze_82203 = get_local_size(0);
    wave_sizze_82202 = LOCKSTEP_WIDTH;
    group_id_70443 = get_group_id(0);
    int32_t gtid_70430;
    int32_t ltid_70431;
    // Split the flat global id into quotient and remainder by
    // computed_group_sizze_70945.
    gtid_70430 = squot32(global_tid_70441, computed_group_sizze_70945);
    ltid_70431 = global_tid_70441 - squot32(global_tid_70441,
                                            computed_group_sizze_70945) *
        computed_group_sizze_70945;
    __local char *mem_80471;
    double res_71035;
    __local char *mem_80474;
    double res_71042;
    mem_80471 = (__local char *) mem_80471_backing_0;
    // ---- Reduction 1: stage futrts_exp64(q)^2 into mem_80471 ----
    for (int32_t comb_iter_82204 = 0; comb_iter_82204 < squot32(D_68333 +
                                                                computed_group_sizze_70945 -
                                                                1,
                                                                computed_group_sizze_70945);
         comb_iter_82204++) {
        int32_t ctid_70433;
        int32_t flat_comb_id_82205 = comb_iter_82204 *
                computed_group_sizze_70945 + local_tid_70442;
        ctid_70433 = flat_comb_id_82205;
        if (slt32(ctid_70433, D_68333) && 1) {
            double qs_elem_elem_71032;
            double res_71033;
            double res_71034;
            qs_elem_elem_71032 = *(__global
                                   double *) &qs_mem_80369[(gtid_70430 *
                                                            D_68322 +
                                                            ltid_70431) * 8];
            res_71033 = futrts_exp64(qs_elem_elem_71032);
            res_71034 = res_71033 * res_71033;
            *(__local double *) &mem_80471[ctid_70433 * 8] = res_71034;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int32_t offset_82206;
    int32_t skip_waves_82207;
    double x_71036;
    double x_71037;
    offset_82206 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_70442, D_68333)) {
            x_71036 = *(__local double *) &mem_80471[(local_tid_70442 +
                                                      offset_82206) * 8];
        }
    }
    offset_82206 = 1;
    // Phase 1: pairwise combination inside each run of wave_sizze_82202
    // consecutive local ids, offset doubling each step. No barrier is issued
    // inside this loop; accesses go through volatile pointers instead.
    while (slt32(offset_82206, wave_sizze_82202)) {
        if (slt32(local_tid_70442 + offset_82206, D_68333) &&
            ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *
              wave_sizze_82202) & (2 * offset_82206 - 1)) == 0) {
            // read array element
            {
                x_71037 = *(volatile __local
                            double *) &mem_80471[(local_tid_70442 +
                                                  offset_82206) * 8];
            }
            // apply reduction operation
            {
                double res_71038;
                // NOTE(review): res_71038 stays unassigned when this guard
                // fails, yet it is copied into the accumulator below.
                if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                                        computed_group_sizze_70945)) {
                    res_71038 = x_71036 + x_71037;
                }
                x_71036 = res_71038;
            }
            // write result of operation
            {
                *(volatile __local double *) &mem_80471[local_tid_70442 * 8] =
                    x_71036;
            }
        }
        offset_82206 *= 2;
    }
    skip_waves_82207 = 1;
    // Phase 2: combine the per-wave partial sums. Only the first work-item
    // of a wave takes part, and every step starts with a barrier.
    while (slt32(skip_waves_82207, squot32(computed_group_sizze_70945 +
                                           wave_sizze_82202 - 1,
                                           wave_sizze_82202))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_82206 = skip_waves_82207 * wave_sizze_82202;
        if (slt32(local_tid_70442 + offset_82206, D_68333) &&
            ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *
              wave_sizze_82202) == 0 && (squot32(local_tid_70442,
                                                 wave_sizze_82202) & (2 *
                                                                      skip_waves_82207 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_71037 = *(__local double *) &mem_80471[(local_tid_70442 +
                                                          offset_82206) * 8];
            }
            // apply reduction operation
            {
                double res_71038;
                if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                                        computed_group_sizze_70945)) {
                    res_71038 = x_71036 + x_71037;
                }
                x_71036 = res_71038;
            }
            // write result of operation
            {
                *(__local double *) &mem_80471[local_tid_70442 * 8] = x_71036;
            }
        }
        skip_waves_82207 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Total of reduction 1, read by every work-item from element 0.
    res_71035 = *(__local double *) &mem_80471[0];
    mem_80474 = (__local char *) mem_80474_backing_1;
    // ---- Reduction 2: stage icf^2 into mem_80474 ----
    for (int32_t comb_iter_82208 = 0; comb_iter_82208 < squot32(triD_68324 +
                                                                computed_group_sizze_70945 -
                                                                1,
                                                                computed_group_sizze_70945);
         comb_iter_82208++) {
        int32_t ctid_70435;
        int32_t flat_comb_id_82209 = comb_iter_82208 *
                computed_group_sizze_70945 + local_tid_70442;
        ctid_70435 = flat_comb_id_82209;
        if (slt32(ctid_70435, triD_68324) && 1) {
            double x_71040;
            double res_71041;
            x_71040 = *(__global double *) &icf_mem_80370[(gtid_70430 *
                                                           triD_68324 +
                                                           ltid_70431) * 8];
            res_71041 = x_71040 * x_71040;
            *(__local double *) &mem_80474[ctid_70435 * 8] = res_71041;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int32_t offset_82210;
    int32_t skip_waves_82211;
    double x_71043;
    double x_71044;
    offset_82210 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_70442, triD_68324)) {
            x_71043 = *(__local double *) &mem_80474[(local_tid_70442 +
                                                      offset_82210) * 8];
        }
    }
    offset_82210 = 1;
    // Phase 1 (within a wave, no barrier, volatile accesses).
    while (slt32(offset_82210, wave_sizze_82202)) {
        if (slt32(local_tid_70442 + offset_82210, triD_68324) &&
            ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *
              wave_sizze_82202) & (2 * offset_82210 - 1)) == 0) {
            // read array element
            {
                x_71044 = *(volatile __local
                            double *) &mem_80474[(local_tid_70442 +
                                                  offset_82210) * 8];
            }
            // apply reduction operation
            {
                double res_71045;
                if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                                        computed_group_sizze_70945)) {
                    res_71045 = x_71043 + x_71044;
                }
                x_71043 = res_71045;
            }
            // write result of operation
            {
                *(volatile __local double *) &mem_80474[local_tid_70442 * 8] =
                    x_71043;
            }
        }
        offset_82210 *= 2;
    }
    skip_waves_82211 = 1;
    // Phase 2 (across waves, barrier at the start of every step).
    while (slt32(skip_waves_82211, squot32(computed_group_sizze_70945 +
                                           wave_sizze_82202 - 1,
                                           wave_sizze_82202))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_82210 = skip_waves_82211 * wave_sizze_82202;
        if (slt32(local_tid_70442 + offset_82210, triD_68324) &&
            ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *
              wave_sizze_82202) == 0 && (squot32(local_tid_70442,
                                                 wave_sizze_82202) & (2 *
                                                                      skip_waves_82211 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_71044 = *(__local double *) &mem_80474[(local_tid_70442 +
                                                          offset_82210) * 8];
            }
            // apply reduction operation
            {
                double res_71045;
                if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                                        computed_group_sizze_70945)) {
                    res_71045 = x_71043 + x_71044;
                }
                x_71043 = res_71045;
            }
            // write result of operation
            {
                *(__local double *) &mem_80474[local_tid_70442 * 8] = x_71043;
            }
        }
        skip_waves_82211 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Total of reduction 2, read by every work-item from element 0.
    res_71042 = *(__local double *) &mem_80474[0];
    double y_71046;
    double y_71047;
    double x_71048;
    // x_71048 = 0.5 * (x_68452 * (res_71035 + res_71042))
    if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                            computed_group_sizze_70945)) {
        y_71046 = res_71035 + res_71042;
        y_71047 = x_68452 * y_71046;
        x_71048 = 0.5 * y_71047;
    }
    __local char *mem_80477;
    double res_71051;
    mem_80477 = (__local char *) mem_80477_backing_2;
    // ---- Reduction 3: stage the raw q values into mem_80477 ----
    for (int32_t comb_iter_82212 = 0; comb_iter_82212 < squot32(D_68333 +
                                                                computed_group_sizze_70945 -
                                                                1,
                                                                computed_group_sizze_70945);
         comb_iter_82212++) {
        int32_t ctid_70437;
        int32_t flat_comb_id_82213 = comb_iter_82212 *
                computed_group_sizze_70945 + local_tid_70442;
        ctid_70437 = flat_comb_id_82213;
        if (slt32(ctid_70437, D_68333) && 1) {
            double x_71050 = *(__global double *) &qs_mem_80369[(gtid_70430 *
                                                                 D_68322 +
                                                                 ltid_70431) *
                                                                8];
            *(__local double *) &mem_80477[ctid_70437 * 8] = x_71050;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int32_t offset_82214;
    int32_t skip_waves_82215;
    double x_71052;
    double x_71053;
    offset_82214 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_70442, D_68333)) {
            x_71052 = *(__local double *) &mem_80477[(local_tid_70442 +
                                                      offset_82214) * 8];
        }
    }
    offset_82214 = 1;
    // Phase 1 (within a wave, no barrier, volatile accesses).
    while (slt32(offset_82214, wave_sizze_82202)) {
        if (slt32(local_tid_70442 + offset_82214, D_68333) &&
            ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *
              wave_sizze_82202) & (2 * offset_82214 - 1)) == 0) {
            // read array element
            {
                x_71053 = *(volatile __local
                            double *) &mem_80477[(local_tid_70442 +
                                                  offset_82214) * 8];
            }
            // apply reduction operation
            {
                double res_71054;
                if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                                        computed_group_sizze_70945)) {
                    res_71054 = x_71052 + x_71053;
                }
                x_71052 = res_71054;
            }
            // write result of operation
            {
                *(volatile __local double *) &mem_80477[local_tid_70442 * 8] =
                    x_71052;
            }
        }
        offset_82214 *= 2;
    }
    skip_waves_82215 = 1;
    // Phase 2 (across waves, barrier at the start of every step).
    while (slt32(skip_waves_82215, squot32(computed_group_sizze_70945 +
                                           wave_sizze_82202 - 1,
                                           wave_sizze_82202))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_82214 = skip_waves_82215 * wave_sizze_82202;
        if (slt32(local_tid_70442 + offset_82214, D_68333) &&
            ((local_tid_70442 - squot32(local_tid_70442, wave_sizze_82202) *
              wave_sizze_82202) == 0 && (squot32(local_tid_70442,
                                                 wave_sizze_82202) & (2 *
                                                                      skip_waves_82215 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_71053 = *(__local double *) &mem_80477[(local_tid_70442 +
                                                          offset_82214) * 8];
            }
            // apply reduction operation
            {
                double res_71054;
                if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                                        computed_group_sizze_70945)) {
                    res_71054 = x_71052 + x_71053;
                }
                x_71052 = res_71054;
            }
            // write result of operation
            {
                *(__local double *) &mem_80477[local_tid_70442 * 8] = x_71052;
            }
        }
        skip_waves_82215 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Total of reduction 3, read by every work-item from element 0.
    res_71051 = *(__local double *) &mem_80477[0];
    double y_71055;
    double x_71056;
    double res_71057;
    // res_71057 = x_71048 - res_68453 * res_71051 - y_68475
    if (slt32(gtid_70430, K_68318) && slt32(ltid_70431,
                                            computed_group_sizze_70945)) {
        y_71055 = res_68453 * res_71051;
        x_71056 = x_71048 - y_71055;
        res_71057 = x_71056 - y_68475;
    }
    // Work-item 0 publishes one result per work-group.
    if (local_tid_70442 == 0) {
        *(__global double *) &mem_80480[group_id_70443 * 8] = res_71057;
    }
}
// map_intra_group_71320 -- work-group kernel (the code looks machine-generated;
// presumably compiler output -- confirm before hand-editing).
//
// The body runs in four phases separated by local-memory barriers:
//   1. Every thread whose combine index ctid is < K_68510 computes one scalar
//      (res_72143) and stores exp(res_72143) in local mem_80524[ctid] and
//      res_72143 itself in local mem_80527[ctid].
//   2. The first K_68510 doubles of mem_80524 are summed by the work-group;
//      every thread then reads the total from mem_80524[0] (res_72145).
//   3. Every thread with ctid < K_68510 computes three vectors (two of length
//      D_68526, one of length triD_68516) plus one scalar, using the global
//      mem_805xx / mem_816xx buffers as per-thread scratch, and stages them in
//      local mem_80597 / mem_80601 / mem_80605 / mem_80608.
//   4. The work-group cooperatively copies those four local arrays to the
//      global outputs mem_80617 / mem_80622 / mem_80627 / mem_80612.
//
// Parameters:
//   mem_*_backing_aligned_N  local-memory backing stores for the six __local
//                            arrays used below.
//   N_68508, K_68510, D_68526, triD_68516
//                            loop extents / bounds used throughout.
//   D_68509, D_68512, D_68514
//                            row strides (in doubles) of x, means and qs.
//   d_r_68524                scalar multiplied by 1 / (phase-2 sum).
//   computed_group_sizze_71318
//                            divisor used to split the global id into
//                            (gtid, ltid) and stride of the scratch layout.
//   x/alphas/means/qs/icf    read-only inputs (only loads are issued on them
//                            in this kernel).
//   remaining mem_*          global scratch and output buffers.
//
// Scratch layout: every global scratch access has the form
//   group_id * (group_size * extent) + local_tid + element * group_size,
// so for a fixed element consecutive local ids touch consecutive doubles.
//
// squot32, slt32, sdiv32, futrts_exp64 and LOCKSTEP_WIDTH are defined outside
// this block; they are presumably signed quotient, signed less-than, signed
// division, double-precision exp and the hardware wave width -- TODO confirm.
__kernel void map_intra_group_71320(__local volatile
int64_t *mem_80524_backing_aligned_0,
__local volatile
int64_t *mem_80527_backing_aligned_1,
__local volatile
int64_t *mem_80597_backing_aligned_2,
__local volatile
int64_t *mem_80601_backing_aligned_3,
__local volatile
int64_t *mem_80605_backing_aligned_4,
__local volatile
int64_t *mem_80608_backing_aligned_5,
int32_t N_68508, int32_t D_68509,
int32_t K_68510, int32_t D_68512,
int32_t D_68514, int32_t triD_68516,
double d_r_68524, int32_t D_68526,
int32_t computed_group_sizze_71318, __global
unsigned char *x_mem_80366, __global
unsigned char *alphas_mem_80367, __global
unsigned char *means_mem_80368, __global
unsigned char *qs_mem_80369, __global
unsigned char *icf_mem_80370, __global
unsigned char *mem_80521, __global
unsigned char *mem_80530, __global
unsigned char *mem_80534, __global
unsigned char *mem_80537, __global
unsigned char *mem_80541, __global
unsigned char *mem_80545, __global
unsigned char *mem_80548, __global
unsigned char *mem_80551, __global
unsigned char *mem_80554, __global
unsigned char *mem_80557, __global
unsigned char *mem_80561, __global
unsigned char *mem_80565, __global
unsigned char *mem_80568, __global
unsigned char *mem_80571, __global
unsigned char *mem_80574, __global
unsigned char *mem_80577, __global
unsigned char *mem_80580, __global
unsigned char *mem_80583, __global
unsigned char *mem_80590, __global
unsigned char *mem_80593, __global
unsigned char *mem_80612, __global
unsigned char *mem_80617, __global
unsigned char *mem_80622, __global
unsigned char *mem_80627, __global
unsigned char *mem_81683, __global
unsigned char *mem_81686, __global
unsigned char *mem_81693, __global
unsigned char *mem_81696)
{
// The three block_dim constants are not referenced again in this kernel.
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
// Byte-addressed views of the local backing buffers.
__local volatile char *restrict mem_80524_backing_0 =
mem_80524_backing_aligned_0;
__local volatile char *restrict mem_80527_backing_1 =
mem_80527_backing_aligned_1;
__local volatile char *restrict mem_80597_backing_2 =
mem_80597_backing_aligned_2;
__local volatile char *restrict mem_80601_backing_3 =
mem_80601_backing_aligned_3;
__local volatile char *restrict mem_80605_backing_4 =
mem_80605_backing_aligned_4;
__local volatile char *restrict mem_80608_backing_5 =
mem_80608_backing_aligned_5;
int32_t global_tid_71320;
int32_t local_tid_71321;
int32_t group_sizze_82424;
int32_t wave_sizze_82423;
int32_t group_id_71322;
global_tid_71320 = get_global_id(0);
local_tid_71321 = get_local_id(0);
// group_sizze_82424 is assigned here but never read in this kernel.
group_sizze_82424 = get_local_size(0);
wave_sizze_82423 = LOCKSTEP_WIDTH;
group_id_71322 = get_group_id(0);
// Split the global id: gtid = global_tid / group size, ltid = the remainder.
int32_t gtid_71203;
int32_t ltid_71204;
gtid_71203 = squot32(global_tid_71320, computed_group_sizze_71318);
ltid_71204 = global_tid_71320 - squot32(global_tid_71320,
computed_group_sizze_71318) *
computed_group_sizze_71318;
// Bounds guard with an empty body: it has no effect.
if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,
computed_group_sizze_71318)) { }
__local char *mem_80524;
__local char *mem_80527;
double res_72145;
mem_80524 = (__local char *) mem_80524_backing_0;
mem_80527 = (__local char *) mem_80527_backing_1;
// ---- Phase 1: per-ctid scalar, stored to local mem_80524 / mem_80527 ----
// ctid takes the values local_tid, local_tid + group size, ... below K_68510.
// NOTE(review): the input loads below are indexed with ltid_71204 while the
// local stores use ctid_71241; these presumably coincide only when the loop
// runs a single iteration and the launch size matches
// computed_group_sizze_71318 -- confirm against the host code.
for (int32_t comb_iter_82425 = 0; comb_iter_82425 < squot32(K_68510 +
computed_group_sizze_71318 -
1,
computed_group_sizze_71318);
comb_iter_82425++) {
int32_t ctid_71241;
int32_t flat_comb_id_82426 = comb_iter_82425 *
computed_group_sizze_71318 + local_tid_71321;
ctid_71241 = flat_comb_id_82426;
if (slt32(ctid_71241, K_68510) && 1) {
double alphas_elem_72060 = *(__global
double *) &alphas_mem_80367[ltid_71204 *
8];
// res_72065 = sum of the D_68526 entries of qs row ltid.
double res_72065;
double x_72068 = 0.0;
int32_t chunk_sizze_72066;
int32_t chunk_offset_72067 = 0;
chunk_sizze_72066 = D_68526;
double res_72070;
double acc_72073 = x_72068;
int32_t groupstream_mapaccum_dummy_chunk_sizze_72071;
groupstream_mapaccum_dummy_chunk_sizze_72071 = 1;
// chunk_sizze_72066 was just set to D_68526, so the first arm is always
// taken; the else arm repeats the same loop with the variable bound.
if (chunk_sizze_72066 == D_68526) {
for (int32_t i_72072 = 0; i_72072 < D_68526; i_72072++) {
double x_72075;
double res_72078;
x_72075 = *(__global double *) &qs_mem_80369[(ltid_71204 *
D_68514 +
chunk_offset_72067 +
i_72072) * 8];
res_72078 = acc_72073 + x_72075;
double acc_tmp_82427 = res_72078;
acc_72073 = acc_tmp_82427;
}
} else {
for (int32_t i_72072 = 0; i_72072 < chunk_sizze_72066;
i_72072++) {
double x_72075;
double res_72078;
x_72075 = *(__global double *) &qs_mem_80369[(ltid_71204 *
D_68514 +
chunk_offset_72067 +
i_72072) * 8];
res_72078 = acc_72073 + x_72075;
double acc_tmp_82428 = res_72078;
acc_72073 = acc_tmp_82428;
}
}
res_72070 = acc_72073;
x_72068 = res_72070;
res_72065 = x_72068;
double x_72079;
x_72079 = alphas_elem_72060 + res_72065;
// Store the difference vector x[gtid, i] - means[ltid, i] in scratch
// mem_80521.
for (int32_t i_72084 = 0; i_72084 < D_68526; i_72084++) {
double x_elem_elem_72085;
double means_elem_elem_72086;
double res_72087;
x_elem_elem_72085 = *(__global
double *) &x_mem_80366[(gtid_71203 *
D_68509 +
i_72084) * 8];
means_elem_elem_72086 = *(__global
double *) &means_mem_80368[(ltid_71204 *
D_68512 +
i_72084) *
8];
res_72087 = x_elem_elem_72085 - means_elem_elem_72086;
*(__global double *) &mem_80521[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) + local_tid_71321 +
i_72084 *
computed_group_sizze_71318) *
8] = res_72087;
}
// res_72089 = sum over rows r of (row r of a lower-triangular matrix dotted
// with the difference vector) squared. Matrix entry (r, c) is 0 for r < c,
// exp(qs[ltid, r]) for r == c, and an icf element otherwise.
double res_72089;
double x_72092 = 0.0;
int32_t chunk_sizze_72090;
int32_t chunk_offset_72091 = 0;
chunk_sizze_72090 = D_68526;
double res_72095;
double acc_72098 = x_72092;
int32_t groupstream_mapaccum_dummy_chunk_sizze_72096;
// This i_72097 is shadowed by the loop variable declared in the for
// statement below and is never read.
int32_t i_72097 = 0;
groupstream_mapaccum_dummy_chunk_sizze_72096 = chunk_sizze_72090;
for (int32_t i_72097 = 0; i_72097 < chunk_sizze_72090; i_72097++) {
int32_t convop_x_79593;
double qs_elem_elem_72102;
// convop_x_79593 is the row index r.
convop_x_79593 = chunk_offset_72091 + i_72097;
qs_elem_elem_72102 = *(__global
double *) &qs_mem_80369[(ltid_71204 *
D_68514 +
chunk_offset_72091 +
i_72097) * 8];
double res_72104;
double x_72107 = 0.0;
int32_t chunk_sizze_72105;
int32_t chunk_offset_72106 = 0;
chunk_sizze_72105 = D_68526;
double res_72110;
double acc_72113 = x_72107;
int32_t groupstream_mapaccum_dummy_chunk_sizze_72111;
groupstream_mapaccum_dummy_chunk_sizze_72111 = 1;
// As above: chunk_sizze_72105 == D_68526 always holds here, so the else
// arm is a never-taken duplicate.
if (chunk_sizze_72105 == D_68526) {
for (int32_t i_72112 = 0; i_72112 < D_68526; i_72112++) {
int32_t convop_x_79589;
double x_72117;
bool cond_72119;
double res_72120;
double res_72136;
double res_72138;
// convop_x_79589 is the column index c.
convop_x_79589 = chunk_offset_72106 + i_72112;
x_72117 = *(__global
double *) &mem_80521[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
computed_group_sizze_71318 *
(chunk_offset_72106 +
i_72112) + 0 *
computed_group_sizze_71318) *
8];
cond_72119 = slt32(convop_x_79593, convop_x_79589);
if (cond_72119) {
res_72120 = 0.0;
} else {
bool cond_72121;
double res_72122;
cond_72121 = convop_x_79593 == convop_x_79589;
if (cond_72121) {
double res_72123;
res_72123 = futrts_exp64(qs_elem_elem_72102);
res_72122 = res_72123;
} else {
// Strictly-lower entry: icf index is
// D*(D-1)/2 - (D-c)*(D-c-1)/2 + (r - c - 1).
int32_t y_72124;
int32_t x_72125;
int32_t res_72126;
int32_t gmm_knossos_tri_arg_72127;
int32_t y_72128;
int32_t x_72129;
int32_t res_72130;
int32_t x_72131;
int32_t x_72132;
int32_t y_72133;
int32_t i_72134;
double res_72135;
y_72124 = D_68526 - 1;
x_72125 = D_68526 * y_72124;
res_72126 = sdiv32(x_72125, 2);
gmm_knossos_tri_arg_72127 = D_68526 -
convop_x_79589;
y_72128 = gmm_knossos_tri_arg_72127 - 1;
x_72129 = gmm_knossos_tri_arg_72127 * y_72128;
res_72130 = sdiv32(x_72129, 2);
x_72131 = res_72126 - res_72130;
x_72132 = convop_x_79593 - convop_x_79589;
y_72133 = x_72132 - 1;
i_72134 = x_72131 + y_72133;
res_72135 = *(__global
double *) &icf_mem_80370[(ltid_71204 *
triD_68516 +
i_72134) *
8];
res_72122 = res_72135;
}
res_72120 = res_72122;
}
res_72136 = x_72117 * res_72120;
res_72138 = acc_72113 + res_72136;
double acc_tmp_82430 = res_72138;
acc_72113 = acc_tmp_82430;
}
} else {
for (int32_t i_72112 = 0; i_72112 < chunk_sizze_72105;
i_72112++) {
int32_t convop_x_79589;
double x_72117;
bool cond_72119;
double res_72120;
double res_72136;
double res_72138;
convop_x_79589 = chunk_offset_72106 + i_72112;
x_72117 = *(__global
double *) &mem_80521[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
computed_group_sizze_71318 *
(chunk_offset_72106 +
i_72112) + 0 *
computed_group_sizze_71318) *
8];
cond_72119 = slt32(convop_x_79593, convop_x_79589);
if (cond_72119) {
res_72120 = 0.0;
} else {
bool cond_72121;
double res_72122;
cond_72121 = convop_x_79593 == convop_x_79589;
if (cond_72121) {
double res_72123;
res_72123 = futrts_exp64(qs_elem_elem_72102);
res_72122 = res_72123;
} else {
int32_t y_72124;
int32_t x_72125;
int32_t res_72126;
int32_t gmm_knossos_tri_arg_72127;
int32_t y_72128;
int32_t x_72129;
int32_t res_72130;
int32_t x_72131;
int32_t x_72132;
int32_t y_72133;
int32_t i_72134;
double res_72135;
y_72124 = D_68526 - 1;
x_72125 = D_68526 * y_72124;
res_72126 = sdiv32(x_72125, 2);
gmm_knossos_tri_arg_72127 = D_68526 -
convop_x_79589;
y_72128 = gmm_knossos_tri_arg_72127 - 1;
x_72129 = gmm_knossos_tri_arg_72127 * y_72128;
res_72130 = sdiv32(x_72129, 2);
x_72131 = res_72126 - res_72130;
x_72132 = convop_x_79593 - convop_x_79589;
y_72133 = x_72132 - 1;
i_72134 = x_72131 + y_72133;
res_72135 = *(__global
double *) &icf_mem_80370[(ltid_71204 *
triD_68516 +
i_72134) *
8];
res_72122 = res_72135;
}
res_72120 = res_72122;
}
res_72136 = x_72117 * res_72120;
res_72138 = acc_72113 + res_72136;
double acc_tmp_82431 = res_72138;
acc_72113 = acc_tmp_82431;
}
}
res_72110 = acc_72113;
x_72107 = res_72110;
res_72104 = x_72107;
double res_72139;
double res_72141;
// Accumulate the square of this row's dot product.
res_72139 = res_72104 * res_72104;
res_72141 = acc_72098 + res_72139;
acc_72098 = res_72141;
}
res_72095 = acc_72098;
x_72092 = res_72095;
res_72089 = x_72092;
double y_72142;
double res_72143;
double res_72144;
// res_72143 = (alpha + sum of qs row) - 0.5 * (sum of squares).
y_72142 = 0.5 * res_72089;
res_72143 = x_72079 - y_72142;
res_72144 = futrts_exp64(res_72143);
*(__local double *) &mem_80524[ctid_71241 * 8] = res_72144;
*(__local double *) &mem_80527[ctid_71241 * 8] = res_72143;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// ---- Phase 2: sum the first K_68510 doubles of mem_80524 ----
int32_t offset_82432;
int32_t skip_waves_82433;
double x_72146;
double x_72147;
offset_82432 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_71321, K_68510)) {
x_72146 = *(__local double *) &mem_80524[(local_tid_71321 +
offset_82432) * 8];
}
}
// Stage A: reduction within a wave. offset doubles from 1 while it is below
// wave_sizze_82423. A thread combines when its index within the wave is a
// multiple of 2 * offset and its partner index is below K_68510. No barrier is
// issued inside this loop and the accesses are volatile -- presumably this
// relies on the threads of a wave executing in lockstep (TODO confirm).
// NOTE(review): res_72148 is only assigned when the (gtid, ltid) guard holds;
// otherwise x_72146 is overwritten with an indeterminate value. Confirm the
// guard cannot fail for launched threads.
offset_82432 = 1;
while (slt32(offset_82432, wave_sizze_82423)) {
if (slt32(local_tid_71321 + offset_82432, K_68510) &&
((local_tid_71321 - squot32(local_tid_71321, wave_sizze_82423) *
wave_sizze_82423) & (2 * offset_82432 - 1)) == 0) {
// read array element
{
x_72147 = *(volatile __local
double *) &mem_80524[(local_tid_71321 +
offset_82432) * 8];
}
// apply reduction operation
{
double res_72148;
if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,
computed_group_sizze_71318)) {
res_72148 = x_72146 + x_72147;
}
x_72146 = res_72148;
}
// write result of operation
{
*(volatile __local double *) &mem_80524[local_tid_71321 * 8] =
x_72146;
}
}
offset_82432 *= 2;
}
// Stage B: reduction across waves. skip_waves doubles from 1 while it is below
// ceil(group size / wave size). Only the first thread of a wave whose wave
// index is a multiple of 2 * skip_waves combines, and a barrier precedes every
// step.
skip_waves_82433 = 1;
while (slt32(skip_waves_82433, squot32(computed_group_sizze_71318 +
wave_sizze_82423 - 1,
wave_sizze_82423))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82432 = skip_waves_82433 * wave_sizze_82423;
if (slt32(local_tid_71321 + offset_82432, K_68510) &&
((local_tid_71321 - squot32(local_tid_71321, wave_sizze_82423) *
wave_sizze_82423) == 0 && (squot32(local_tid_71321,
wave_sizze_82423) & (2 *
skip_waves_82433 -
1)) ==
0)) {
// read array element
{
x_72147 = *(__local double *) &mem_80524[(local_tid_71321 +
offset_82432) * 8];
}
// apply reduction operation
{
double res_72148;
if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,
computed_group_sizze_71318)) {
res_72148 = x_72146 + x_72147;
}
x_72146 = res_72148;
}
// write result of operation
{
*(__local double *) &mem_80524[local_tid_71321 * 8] = x_72146;
}
}
skip_waves_82433 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Every thread reads the reduced total from slot 0.
res_72145 = *(__local double *) &mem_80524[0];
double x_72149;
double res_72150;
// res_72150 = d_r_68524 / total; it is left unassigned if the guard fails.
if (slt32(gtid_71203, N_68508) && slt32(ltid_71204,
computed_group_sizze_71318)) {
x_72149 = 1.0 / res_72145;
res_72150 = d_r_68524 * x_72149;
}
__local char *mem_80597;
__local char *mem_80601;
__local char *mem_80605;
__local char *mem_80608;
mem_80597 = (__local char *) mem_80597_backing_2;
mem_80601 = (__local char *) mem_80601_backing_3;
mem_80605 = (__local char *) mem_80605_backing_4;
mem_80608 = (__local char *) mem_80608_backing_5;
// ---- Phase 3: per-ctid vectors, staged in local memory ----
for (int32_t comb_iter_82434 = 0; comb_iter_82434 < squot32(K_68510 +
computed_group_sizze_71318 -
1,
computed_group_sizze_71318);
comb_iter_82434++) {
int32_t ctid_71310;
int32_t flat_comb_id_82435 = comb_iter_82434 *
computed_group_sizze_71318 + local_tid_71321;
ctid_71310 = flat_comb_id_82435;
if (slt32(ctid_71310, K_68510) && 1) {
double res_elem_72156;
double res_72160;
double res_72161;
double y_72201;
double rev_sqnorm_arg_72202;
// res_72161 = res_72150 * exp(phase-1 value stored at mem_80527[ltid]).
res_elem_72156 = *(__local double *) &mem_80527[ltid_71204 * 8];
res_72160 = futrts_exp64(res_elem_72156);
res_72161 = res_72150 * res_72160;
// Rebuild the phase-1 data in scratch: for each row i_72170 write the
// matrix row to mem_80557, copy it into row i_72170 of the D x D buffer
// mem_80534, and store the difference x - means in mem_80530.
for (int32_t i_72170 = 0; i_72170 < D_68526; i_72170++) {
double qs_elem_elem_72172;
double x_elem_elem_72173;
double means_elem_elem_72174;
double res_72198;
qs_elem_elem_72172 = *(__global
double *) &qs_mem_80369[(ltid_71204 *
D_68514 +
i_72170) * 8];
x_elem_elem_72173 = *(__global
double *) &x_mem_80366[(gtid_71203 *
D_68509 +
i_72170) * 8];
means_elem_elem_72174 = *(__global
double *) &means_mem_80368[(ltid_71204 *
D_68512 +
i_72170) *
8];
for (int32_t i_72178 = 0; i_72178 < D_68526; i_72178++) {
bool cond_72180;
double res_72181;
cond_72180 = slt32(i_72170, i_72178);
if (cond_72180) {
res_72181 = 0.0;
} else {
bool cond_72182;
double res_72183;
cond_72182 = i_72170 == i_72178;
if (cond_72182) {
double res_72184;
res_72184 = futrts_exp64(qs_elem_elem_72172);
res_72183 = res_72184;
} else {
// Same icf index formula as in phase 1.
int32_t y_72185;
int32_t x_72186;
int32_t res_72187;
int32_t gmm_knossos_tri_arg_72188;
int32_t y_72189;
int32_t x_72190;
int32_t res_72191;
int32_t x_72192;
int32_t x_72193;
int32_t y_72194;
int32_t i_72195;
double res_72196;
y_72185 = D_68526 - 1;
x_72186 = D_68526 * y_72185;
res_72187 = sdiv32(x_72186, 2);
gmm_knossos_tri_arg_72188 = D_68526 - i_72178;
y_72189 = gmm_knossos_tri_arg_72188 - 1;
x_72190 = gmm_knossos_tri_arg_72188 * y_72189;
res_72191 = sdiv32(x_72190, 2);
x_72192 = res_72187 - res_72191;
x_72193 = i_72170 - i_72178;
y_72194 = x_72193 - 1;
i_72195 = x_72192 + y_72194;
res_72196 = *(__global
double *) &icf_mem_80370[(ltid_71204 *
triD_68516 +
i_72195) *
8];
res_72183 = res_72196;
}
res_72181 = res_72183;
}
*(__global double *) &mem_80557[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 + i_72178 *
computed_group_sizze_71318) *
8] = res_72181;
}
res_72198 = x_elem_elem_72173 - means_elem_elem_72174;
*(__global double *) &mem_80530[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) + local_tid_71321 +
i_72170 *
computed_group_sizze_71318) *
8] = res_72198;
for (int32_t i_82439 = 0; i_82439 < D_68526; i_82439++) {
*(__global double *) &mem_80534[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 * D_68526) +
local_tid_71321 + i_72170 *
(computed_group_sizze_71318 *
D_68526) + i_82439 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80557[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82439 *
computed_group_sizze_71318) *
8];
}
}
y_72201 = 0.0 - res_72161;
rev_sqnorm_arg_72202 = 0.5 * y_72201;
// mem_80537[i] = 2 * rev_sqnorm_arg * (row i of mem_80534 dotted with
// mem_80530).
for (int32_t i_72206 = 0; i_72206 < D_68526; i_72206++) {
double res_72208;
double res_72217;
double res_72218;
double redout_72209 = 0.0;
for (int32_t i_72210 = 0; i_72210 < D_68526; i_72210++) {
double x_72211;
double x_72212;
double res_72213;
double res_72216;
x_72211 = *(__global double *) &mem_80530[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_72210 *
computed_group_sizze_71318) *
8];
x_72212 = *(__global double *) &mem_80534[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 *
D_68526) +
local_tid_71321 +
(i_72206 *
(computed_group_sizze_71318 *
D_68526) +
i_72210 *
computed_group_sizze_71318)) *
8];
res_72213 = x_72211 * x_72212;
res_72216 = redout_72209 + res_72213;
double redout_tmp_82441 = res_72216;
redout_72209 = redout_tmp_82441;
}
res_72208 = redout_72209;
res_72217 = rev_sqnorm_arg_72202 * res_72208;
res_72218 = res_72217 + res_72217;
*(__global double *) &mem_80537[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) + local_tid_71321 +
i_72206 *
computed_group_sizze_71318) *
8] = res_72218;
}
// Main loop over i_72230. Each iteration produces:
//   - res_72244 = -(column i_72230 of mem_80534 dotted with mem_80537),
//     stored to mem_80548[i_72230];
//   - row i_72230 of mem_80541 (length D) and of mem_80545 (length triD),
//     each the sum over i_72251 of the one-hot vectors built below.
for (int32_t i_72230 = 0; i_72230 < D_68526; i_72230++) {
double x_72231;
double qs_elem_elem_72234;
double res_72235;
double res_72244;
x_72231 = *(__global double *) &mem_80537[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_72230 *
computed_group_sizze_71318) *
8];
qs_elem_elem_72234 = *(__global
double *) &qs_mem_80369[(ltid_71204 *
D_68514 +
i_72230) * 8];
double redout_72236 = 0.0;
for (int32_t i_72237 = 0; i_72237 < D_68526; i_72237++) {
double x_72238;
double x_72239;
double res_72240;
double res_72243;
x_72238 = *(__global double *) &mem_80534[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 *
D_68526) +
local_tid_71321 +
(i_72237 *
(computed_group_sizze_71318 *
D_68526) +
i_72230 *
computed_group_sizze_71318)) *
8];
x_72239 = *(__global double *) &mem_80537[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_72237 *
computed_group_sizze_71318) *
8];
res_72240 = x_72238 * x_72239;
res_72243 = redout_72236 + res_72240;
double redout_tmp_82445 = res_72243;
redout_72236 = redout_tmp_82445;
}
res_72235 = redout_72236;
res_72244 = 0.0 - res_72235;
// For each i_72251 build a D-vector (in mem_81696) and a triD-vector (in
// mem_81693) from res_72254 = mem_80537[i_72230] * mem_80530[i_72251]:
//   i_72230 <  i_72251: both vectors are all zero;
//   i_72230 == i_72251: D-vector holds res_72254 * exp(qs) at i_72230,
//                       triD-vector is zero;
//   i_72230 >  i_72251: triD-vector holds res_72254 at one index,
//                       D-vector is zero.
// The vectors are then copied to row i_72251 of mem_80561 / mem_80565.
for (int32_t i_72251 = 0; i_72251 < D_68526; i_72251++) {
double x_72252;
double res_72254;
bool cond_72255;
bool cond_72256;
x_72252 = *(__global double *) &mem_80530[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_72251 *
computed_group_sizze_71318) *
8];
res_72254 = x_72231 * x_72252;
cond_72255 = slt32(i_72230, i_72251);
cond_72256 = i_72230 == i_72251;
if (cond_72255) {
// Zero-fill mem_80568 / mem_80571, then copy them to mem_81696 /
// mem_81693.
for (int32_t i_82448 = 0; i_82448 < D_68526;
i_82448++) {
*(__global double *) &mem_80568[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82448 *
computed_group_sizze_71318) *
8] = 0.0;
}
for (int32_t i_82449 = 0; i_82449 < triD_68516;
i_82449++) {
*(__global double *) &mem_80571[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82449 *
computed_group_sizze_71318) *
8] = 0.0;
}
for (int32_t i_82450 = 0; i_82450 < D_68526;
i_82450++) {
*(__global double *) &mem_81696[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82450 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80568[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82450 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82451 = 0; i_82451 < triD_68516;
i_82451++) {
*(__global double *) &mem_81693[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82451 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80571[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82451 *
computed_group_sizze_71318) *
8];
}
} else {
if (cond_72256) {
// Diagonal case: one-hot D-vector in mem_80574, zero triD-vector in
// mem_80577; both are copied to mem_81686 / mem_81683.
double res_72263;
double deltaVec_arg_72264;
res_72263 = futrts_exp64(qs_elem_elem_72234);
deltaVec_arg_72264 = res_72254 * res_72263;
for (int32_t i_72269 = 0; i_72269 < D_68526;
i_72269++) {
bool cond_72271;
double res_72272;
cond_72271 = i_72269 == i_72230;
if (cond_72271) {
res_72272 = deltaVec_arg_72264;
} else {
res_72272 = 0.0;
}
*(__global
double *) &mem_80574[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_72269 *
computed_group_sizze_71318) *
8] = res_72272;
}
for (int32_t i_82453 = 0; i_82453 < triD_68516;
i_82453++) {
*(__global
double *) &mem_80577[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82453 *
computed_group_sizze_71318) *
8] = 0.0;
}
for (int32_t i_82454 = 0; i_82454 < D_68526;
i_82454++) {
*(__global
double *) &mem_81686[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82454 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80574[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82454 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82455 = 0; i_82455 < triD_68516;
i_82455++) {
*(__global
double *) &mem_81683[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82455 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80577[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82455 *
computed_group_sizze_71318) *
8];
}
} else {
// Below-diagonal case: one-hot triD-vector in mem_80580 at index
// i_72251 + i_72230 * (i_72230 - 1) / 2, zero D-vector in mem_80583.
// NOTE(review): this index formula differs from the one used to read
// icf above -- confirm both address the same triangular layout.
int32_t y_72275;
int32_t x_72276;
int32_t res_72277;
int32_t deltaVec_arg_72278;
y_72275 = i_72230 - 1;
x_72276 = i_72230 * y_72275;
res_72277 = sdiv32(x_72276, 2);
deltaVec_arg_72278 = i_72251 + res_72277;
for (int32_t i_72283 = 0; i_72283 < triD_68516;
i_72283++) {
bool cond_72285;
double res_72286;
cond_72285 = i_72283 == deltaVec_arg_72278;
if (cond_72285) {
res_72286 = res_72254;
} else {
res_72286 = 0.0;
}
*(__global
double *) &mem_80580[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_72283 *
computed_group_sizze_71318) *
8] = res_72286;
}
for (int32_t i_82457 = 0; i_82457 < D_68526;
i_82457++) {
*(__global
double *) &mem_80583[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82457 *
computed_group_sizze_71318) *
8] = 0.0;
}
for (int32_t i_82458 = 0; i_82458 < D_68526;
i_82458++) {
*(__global
double *) &mem_81686[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82458 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80583[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82458 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82459 = 0; i_82459 < triD_68516;
i_82459++) {
*(__global
double *) &mem_81683[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82459 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80580[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82459 *
computed_group_sizze_71318) *
8];
}
}
// Forward the inner branch result: mem_81686 -> mem_81696 and
// mem_81683 -> mem_81693.
for (int32_t i_82460 = 0; i_82460 < D_68526;
i_82460++) {
*(__global double *) &mem_81696[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82460 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_81686[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82460 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82461 = 0; i_82461 < triD_68516;
i_82461++) {
*(__global double *) &mem_81693[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82461 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_81683[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82461 *
computed_group_sizze_71318) *
8];
}
}
// Store the two vectors as row i_72251 of mem_80561 (D x D) and of
// mem_80565 (D x triD).
for (int32_t i_82462 = 0; i_82462 < D_68526; i_82462++) {
*(__global double *) &mem_80561[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 * D_68526) +
local_tid_71321 +
i_72251 *
(computed_group_sizze_71318 *
D_68526) + i_82462 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_81696[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82462 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82463 = 0; i_82463 < triD_68516; i_82463++) {
*(__global double *) &mem_80565[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516 *
D_68526) +
local_tid_71321 +
i_72251 *
(computed_group_sizze_71318 *
triD_68516) +
i_82463 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_81693[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82463 *
computed_group_sizze_71318) *
8];
}
}
// Column sums of mem_80561 over its D rows -> mem_80590.
for (int32_t i_72295 = 0; i_72295 < D_68526; i_72295++) {
double res_72297;
double redout_72298 = 0.0;
for (int32_t i_72299 = 0; i_72299 < D_68526; i_72299++) {
double x_72300;
double res_72303;
x_72300 = *(__global
double *) &mem_80561[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 * D_68526) +
local_tid_71321 +
(i_72299 *
(computed_group_sizze_71318 *
D_68526) + i_72295 *
computed_group_sizze_71318)) *
8];
res_72303 = redout_72298 + x_72300;
double redout_tmp_82465 = res_72303;
redout_72298 = redout_tmp_82465;
}
res_72297 = redout_72298;
*(__global double *) &mem_80590[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 + i_72295 *
computed_group_sizze_71318) *
8] = res_72297;
}
// Column sums of mem_80565 over its D rows -> mem_80593.
for (int32_t i_72309 = 0; i_72309 < triD_68516; i_72309++) {
double res_72311;
double redout_72312 = 0.0;
for (int32_t i_72313 = 0; i_72313 < D_68526; i_72313++) {
double x_72314;
double res_72317;
x_72314 = *(__global
double *) &mem_80565[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516 *
D_68526) +
local_tid_71321 +
(i_72313 *
(computed_group_sizze_71318 *
triD_68516) +
i_72309 *
computed_group_sizze_71318)) *
8];
res_72317 = redout_72312 + x_72314;
double redout_tmp_82467 = res_72317;
redout_72312 = redout_tmp_82467;
}
res_72311 = redout_72312;
*(__global double *) &mem_80593[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 + i_72309 *
computed_group_sizze_71318) *
8] = res_72311;
}
// Save the sums as row i_72230 of mem_80541 / mem_80545 and the scalar
// res_72244 as element i_72230 of mem_80548.
for (int32_t i_82468 = 0; i_82468 < D_68526; i_82468++) {
*(__global double *) &mem_80541[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 * D_68526) +
local_tid_71321 + i_72230 *
(computed_group_sizze_71318 *
D_68526) + i_82468 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80590[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82468 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82469 = 0; i_82469 < triD_68516; i_82469++) {
*(__global double *) &mem_80545[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516 * D_68526) +
local_tid_71321 + i_72230 *
(computed_group_sizze_71318 *
triD_68516) + i_82469 *
computed_group_sizze_71318) *
8] = *(__global
double *) &mem_80593[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82469 *
computed_group_sizze_71318) *
8];
}
*(__global double *) &mem_80548[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) + local_tid_71321 +
i_72230 *
computed_group_sizze_71318) *
8] = res_72244;
}
// mem_80551[t] = sum over the D rows of mem_80545 at column t.
for (int32_t i_72326 = 0; i_72326 < triD_68516; i_72326++) {
double res_72328;
double redout_72329 = 0.0;
for (int32_t i_72330 = 0; i_72330 < D_68526; i_72330++) {
double x_72331;
double res_72334;
x_72331 = *(__global double *) &mem_80545[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516 *
D_68526) +
local_tid_71321 +
(i_72330 *
(computed_group_sizze_71318 *
triD_68516) +
i_72326 *
computed_group_sizze_71318)) *
8];
res_72334 = redout_72329 + x_72331;
double redout_tmp_82471 = res_72334;
redout_72329 = redout_tmp_82471;
}
res_72328 = redout_72329;
*(__global double *) &mem_80551[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 + i_72326 *
computed_group_sizze_71318) *
8] = res_72328;
}
// mem_80554[i] = res_72161 + sum over the D rows of mem_80541 at column i.
for (int32_t i_72340 = 0; i_72340 < D_68526; i_72340++) {
double res_72342;
double res_72349;
double redout_72343 = 0.0;
for (int32_t i_72344 = 0; i_72344 < D_68526; i_72344++) {
double x_72345;
double res_72348;
x_72345 = *(__global double *) &mem_80541[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526 *
D_68526) +
local_tid_71321 +
(i_72344 *
(computed_group_sizze_71318 *
D_68526) +
i_72340 *
computed_group_sizze_71318)) *
8];
res_72348 = redout_72343 + x_72345;
double redout_tmp_82473 = res_72348;
redout_72343 = redout_tmp_82473;
}
res_72342 = redout_72343;
res_72349 = res_72161 + res_72342;
*(__global double *) &mem_80554[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) + local_tid_71321 +
i_72340 *
computed_group_sizze_71318) *
8] = res_72349;
}
// Stage this thread's results in local memory, stored contiguously per ctid:
// mem_80548 -> mem_80597, mem_80554 -> mem_80601, mem_80551 -> mem_80605,
// and res_72161 -> mem_80608[ctid].
for (int32_t i_82474 = 0; i_82474 < D_68526; i_82474++) {
*(__local double *) &mem_80597[(ctid_71310 * D_68526 +
i_82474) * 8] = *(__global
double *) &mem_80548[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82474 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82475 = 0; i_82475 < D_68526; i_82475++) {
*(__local double *) &mem_80601[(ctid_71310 * D_68526 +
i_82475) * 8] = *(__global
double *) &mem_80554[(group_id_71322 *
(computed_group_sizze_71318 *
D_68526) +
local_tid_71321 +
i_82475 *
computed_group_sizze_71318) *
8];
}
for (int32_t i_82476 = 0; i_82476 < triD_68516; i_82476++) {
*(__local double *) &mem_80605[(ctid_71310 * triD_68516 +
i_82476) * 8] = *(__global
double *) &mem_80551[(group_id_71322 *
(computed_group_sizze_71318 *
triD_68516) +
local_tid_71321 +
i_82476 *
computed_group_sizze_71318) *
8];
}
*(__local double *) &mem_80608[ctid_71310 * 8] = res_72161;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// ---- Phase 4: cooperative copy of the local arrays to global memory ----
// In each loop a thread handles flat indices local_tid, local_tid + group
// size, ... below the array length. Where an index is split into quotient and
// remainder by D_68526 or triD_68516, the two parts recombine to the same
// flat index.
// mem_80608 (K doubles) -> mem_80612 at offset group_id * K.
for (int32_t i_82477 = 0; i_82477 < squot32(K_68510 - local_tid_71321 +
computed_group_sizze_71318 - 1,
computed_group_sizze_71318);
i_82477++) {
*(__global double *) &mem_80612[(group_id_71322 * K_68510 + (i_82477 *
computed_group_sizze_71318 +
local_tid_71321)) *
8] = *(__local
double *) &mem_80608[(i_82477 *
computed_group_sizze_71318 +
local_tid_71321) *
8];
}
// mem_80597 (K * D doubles) -> mem_80617 at offset group_id * D * K.
for (int32_t i_82478 = 0; i_82478 < squot32(K_68510 * D_68526 -
local_tid_71321 +
computed_group_sizze_71318 - 1,
computed_group_sizze_71318);
i_82478++) {
*(__global double *) &mem_80617[(group_id_71322 * (D_68526 * K_68510) +
squot32(i_82478 *
computed_group_sizze_71318 +
local_tid_71321, D_68526) *
D_68526 + (i_82478 *
computed_group_sizze_71318 +
local_tid_71321 -
squot32(i_82478 *
computed_group_sizze_71318 +
local_tid_71321,
D_68526) *
D_68526)) * 8] = *(__local
double *) &mem_80597[(squot32(i_82478 *
computed_group_sizze_71318 +
local_tid_71321,
D_68526) *
D_68526 +
(i_82478 *
computed_group_sizze_71318 +
local_tid_71321 -
squot32(i_82478 *
computed_group_sizze_71318 +
local_tid_71321,
D_68526) *
D_68526)) *
8];
}
// mem_80601 (K * D doubles) -> mem_80622 at offset group_id * D * K.
for (int32_t i_82479 = 0; i_82479 < squot32(K_68510 * D_68526 -
local_tid_71321 +
computed_group_sizze_71318 - 1,
computed_group_sizze_71318);
i_82479++) {
*(__global double *) &mem_80622[(group_id_71322 * (D_68526 * K_68510) +
squot32(i_82479 *
computed_group_sizze_71318 +
local_tid_71321, D_68526) *
D_68526 + (i_82479 *
computed_group_sizze_71318 +
local_tid_71321 -
squot32(i_82479 *
computed_group_sizze_71318 +
local_tid_71321,
D_68526) *
D_68526)) * 8] = *(__local
double *) &mem_80601[(squot32(i_82479 *
computed_group_sizze_71318 +
local_tid_71321,
D_68526) *
D_68526 +
(i_82479 *
computed_group_sizze_71318 +
local_tid_71321 -
squot32(i_82479 *
computed_group_sizze_71318 +
local_tid_71321,
D_68526) *
D_68526)) *
8];
}
// mem_80605 (K * triD doubles) -> mem_80627 at offset group_id * triD * K.
for (int32_t i_82480 = 0; i_82480 < squot32(K_68510 * triD_68516 -
local_tid_71321 +
computed_group_sizze_71318 - 1,
computed_group_sizze_71318);
i_82480++) {
*(__global double *) &mem_80627[(group_id_71322 * (triD_68516 *
K_68510) +
squot32(i_82480 *
computed_group_sizze_71318 +
local_tid_71321, triD_68516) *
triD_68516 + (i_82480 *
computed_group_sizze_71318 +
local_tid_71321 -
squot32(i_82480 *
computed_group_sizze_71318 +
local_tid_71321,
triD_68516) *
triD_68516)) * 8] =
*(__local double *) &mem_80605[(squot32(i_82480 *
computed_group_sizze_71318 +
local_tid_71321,
triD_68516) * triD_68516 +
(i_82480 *
computed_group_sizze_71318 +
local_tid_71321 - squot32(i_82480 *
computed_group_sizze_71318 +
local_tid_71321,
triD_68516) *
triD_68516)) * 8];
}
}
// map_intra_group_72545
//
// Intra-group kernel. Every work-item derives an index triple
// (gtid_72454, gtid_72455, ltid_72457) from its flat global id; the three
// components are bounds-checked against N_68508, K_68510 and
// computed_group_sizze_76480. The work-items of a group then cooperate
// through eight __local buffers in five barrier-separated phases and finally
// copy the per-group results to global memory.
//
// All buffers are passed as raw bytes and accessed as arrays of double
// (every element index is multiplied by 8 before the byte lookup).
//
// Arguments:
//   mem_*_backing_aligned_0..7  __local storage backing mem_80774, mem_80778,
//                               mem_80781, mem_80821, mem_80825, mem_80828,
//                               mem_80831 and mem_80834.
//   N_68508, K_68510            upper bounds for gtid_72454 / gtid_72455.
//   D_68509, D_68512, D_68514   row strides used when indexing x_mem_80366,
//                               means_mem_80368 and qs_mem_80369.
//   triD_68516                  row stride of icf_mem_80370 and length of
//                               the triD-sized vectors built below.
//   D_68526                     vector length used by every phase.
//   computed_group_sizze_76480  divisor of the global-id decomposition and
//                               stride between per-work-item scratch slots.
//   x_mem_80366, means_mem_80368, qs_mem_80369, icf_mem_80370,
//   mem_80652, mem_80768        only read by this kernel.
//   mem_80771, mem_80785, mem_80789, mem_80792, mem_80795, mem_80798,
//   mem_80801, mem_80804, mem_80807, mem_80810, mem_80813, mem_81729,
//   mem_81732, mem_81739, mem_81742
//                               global scratch; each slot is written and
//                               read back using the same
//                               (group_id, local_tid) index.
//   mem_80837, mem_80841, mem_80845, mem_80849
//                               only written: one scalar / one row per
//                               work-group.
//
// NOTE(review): several phases index with ltid_72457 where the enclosing
// loop variable is ctid_*; presumably the two coincide because the kernel is
// launched with local size == computed_group_sizze_76480 >= D_68526 (and
// >= triD_68516) -- confirm against the host-side launch code.
__kernel void map_intra_group_72545(__local volatile
int64_t *mem_80774_backing_aligned_0,
__local volatile
int64_t *mem_80778_backing_aligned_1,
__local volatile
int64_t *mem_80781_backing_aligned_2,
__local volatile
int64_t *mem_80821_backing_aligned_3,
__local volatile
int64_t *mem_80825_backing_aligned_4,
__local volatile
int64_t *mem_80828_backing_aligned_5,
__local volatile
int64_t *mem_80831_backing_aligned_6,
__local volatile
int64_t *mem_80834_backing_aligned_7,
int32_t N_68508, int32_t D_68509,
int32_t K_68510, int32_t D_68512,
int32_t D_68514, int32_t triD_68516,
int32_t D_68526,
int32_t computed_group_sizze_76480, __global
unsigned char *x_mem_80366, __global
unsigned char *means_mem_80368, __global
unsigned char *qs_mem_80369, __global
unsigned char *icf_mem_80370, __global
unsigned char *mem_80652, __global
unsigned char *mem_80768, __global
unsigned char *mem_80771, __global
unsigned char *mem_80785, __global
unsigned char *mem_80789, __global
unsigned char *mem_80792, __global
unsigned char *mem_80795, __global
unsigned char *mem_80798, __global
unsigned char *mem_80801, __global
unsigned char *mem_80804, __global
unsigned char *mem_80807, __global
unsigned char *mem_80810, __global
unsigned char *mem_80813, __global
unsigned char *mem_80837, __global
unsigned char *mem_80841, __global
unsigned char *mem_80845, __global
unsigned char *mem_80849, __global
unsigned char *mem_81729, __global
unsigned char *mem_81732, __global
unsigned char *mem_81739, __global
unsigned char *mem_81742)
{
// block_dim0..2 are not referenced anywhere else in this kernel.
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
// Byte-typed aliases of the int64_t-typed local-memory arguments.
__local volatile char *restrict mem_80774_backing_0 =
mem_80774_backing_aligned_0;
__local volatile char *restrict mem_80778_backing_1 =
mem_80778_backing_aligned_1;
__local volatile char *restrict mem_80781_backing_2 =
mem_80781_backing_aligned_2;
__local volatile char *restrict mem_80821_backing_3 =
mem_80821_backing_aligned_3;
__local volatile char *restrict mem_80825_backing_4 =
mem_80825_backing_aligned_4;
__local volatile char *restrict mem_80828_backing_5 =
mem_80828_backing_aligned_5;
__local volatile char *restrict mem_80831_backing_6 =
mem_80831_backing_aligned_6;
__local volatile char *restrict mem_80834_backing_7 =
mem_80834_backing_aligned_7;
int32_t global_tid_72545;
int32_t local_tid_72546;
int32_t group_sizze_82575;
int32_t wave_sizze_82574;
int32_t group_id_72547;
global_tid_72545 = get_global_id(0);
local_tid_72546 = get_local_id(0);
// group_sizze_82575 and wave_sizze_82574 are assigned but never read below.
group_sizze_82575 = get_local_size(0);
wave_sizze_82574 = LOCKSTEP_WIDTH;
group_id_72547 = get_group_id(0);
int32_t gtid_72454;
int32_t gtid_72455;
int32_t ltid_72457;
// Unflatten the global id as
//   global_tid = gtid_72454 * (K * gs) + gtid_72455 * gs + ltid_72457
// with gs = computed_group_sizze_76480 (quotient / remainder by K*gs, then
// by gs).
gtid_72454 = squot32(global_tid_72545, K_68510 *
computed_group_sizze_76480);
gtid_72455 = squot32(global_tid_72545 - squot32(global_tid_72545, K_68510 *
computed_group_sizze_76480) *
(K_68510 * computed_group_sizze_76480),
computed_group_sizze_76480);
ltid_72457 = global_tid_72545 - squot32(global_tid_72545, K_68510 *
computed_group_sizze_76480) *
(K_68510 * computed_group_sizze_76480) - squot32(global_tid_72545 -
squot32(global_tid_72545,
K_68510 *
computed_group_sizze_76480) *
(K_68510 *
computed_group_sizze_76480),
computed_group_sizze_76480) *
computed_group_sizze_76480;
double res_76700;
double res_elem_76702;
double res_76706;
double res_76707;
// Per-work-item scalar: res_76707 = mem_80652[gtid_72454] *
// exp(mem_80768[gtid_72455 * N + gtid_72454]). It is left unassigned when the
// index triple is out of bounds.
if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&
slt32(ltid_72457, computed_group_sizze_76480)) {
res_76700 = *(__global double *) &mem_80652[gtid_72454 * 8];
res_elem_76702 = *(__global double *) &mem_80768[(gtid_72455 * N_68508 +
gtid_72454) * 8];
res_76706 = futrts_exp64(res_elem_76702);
res_76707 = res_76700 * res_76706;
}
__local char *mem_80774;
__local char *mem_80778;
mem_80774 = (__local char *) mem_80774_backing_0;
mem_80778 = (__local char *) mem_80778_backing_1;
// ---- Phase 1 -------------------------------------------------------------
// For every ctid < D_68526 (work-items stride over ctid in steps of
// computed_group_sizze_76480): build a length-D vector in the work-item's
// slot of mem_80771, store x - means into mem_80774[ctid] and copy the vector
// into row ctid of the D x D local matrix mem_80778.
for (int32_t comb_iter_82576 = 0; comb_iter_82576 < squot32(D_68526 +
computed_group_sizze_76480 -
1,
computed_group_sizze_76480);
comb_iter_82576++) {
int32_t ctid_72462;
int32_t flat_comb_id_82577 = comb_iter_82576 *
computed_group_sizze_76480 + local_tid_72546;
ctid_72462 = flat_comb_id_82577;
if (slt32(ctid_72462, D_68526) && 1) {
double qs_elem_elem_76713;
double x_elem_elem_76714;
double means_elem_elem_76715;
double res_76739;
// Note: the three loads below are indexed by ltid_72457, not ctid_72462.
qs_elem_elem_76713 = *(__global
double *) &qs_mem_80369[(gtid_72455 *
D_68514 +
ltid_72457) * 8];
x_elem_elem_76714 = *(__global double *) &x_mem_80366[(gtid_72454 *
D_68509 +
ltid_72457) *
8];
means_elem_elem_76715 = *(__global
double *) &means_mem_80368[(gtid_72455 *
D_68512 +
ltid_72457) *
8];
// Element i of the vector is:
//   0.0                      if ltid < i
//   exp(qs element)          if ltid == i
//   icf[gtid_72455 * triD + idx] otherwise, where
//   idx = D*(D-1)/2 - (D-i)*(D-i-1)/2 + (ltid - i - 1).
for (int32_t i_76719 = 0; i_76719 < D_68526; i_76719++) {
bool cond_76721;
double res_76722;
cond_76721 = slt32(ltid_72457, i_76719);
if (cond_76721) {
res_76722 = 0.0;
} else {
bool cond_76723;
double res_76724;
cond_76723 = ltid_72457 == i_76719;
if (cond_76723) {
double res_76725;
res_76725 = futrts_exp64(qs_elem_elem_76713);
res_76724 = res_76725;
} else {
int32_t y_76726;
int32_t x_76727;
int32_t res_76728;
int32_t gmm_knossos_tri_arg_76729;
int32_t y_76730;
int32_t x_76731;
int32_t res_76732;
int32_t x_76733;
int32_t x_76734;
int32_t y_76735;
int32_t i_76736;
double res_76737;
y_76726 = D_68526 - 1;
x_76727 = D_68526 * y_76726;
res_76728 = sdiv32(x_76727, 2);
gmm_knossos_tri_arg_76729 = D_68526 - i_76719;
y_76730 = gmm_knossos_tri_arg_76729 - 1;
x_76731 = gmm_knossos_tri_arg_76729 * y_76730;
res_76732 = sdiv32(x_76731, 2);
x_76733 = res_76728 - res_76732;
x_76734 = ltid_72457 - i_76719;
y_76735 = x_76734 - 1;
i_76736 = x_76733 + y_76735;
res_76737 = *(__global
double *) &icf_mem_80370[(gtid_72455 *
triD_68516 +
i_76736) * 8];
res_76724 = res_76737;
}
res_76722 = res_76724;
}
// Scratch slot layout: group base + local_tid + i * group-size stride.
*(__global double *) &mem_80771[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) + local_tid_72546 +
i_76719 *
computed_group_sizze_76480) *
8] = res_76722;
}
res_76739 = x_elem_elem_76714 - means_elem_elem_76715;
*(__local double *) &mem_80774[ctid_72462 * 8] = res_76739;
// Copy the vector just built from global scratch into local row ctid.
for (int32_t i_82579 = 0; i_82579 < D_68526; i_82579++) {
*(__local double *) &mem_80778[(ctid_72462 * D_68526 +
i_82579) * 8] = *(__global
double *) &mem_80771[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82579 *
computed_group_sizze_76480) *
8];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
double y_76740;
double rev_sqnorm_arg_76741;
// rev_sqnorm_arg_76741 = 0.5 * (0.0 - res_76707); unassigned when out of
// bounds.
if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&
slt32(ltid_72457, computed_group_sizze_76480)) {
y_76740 = 0.0 - res_76707;
rev_sqnorm_arg_76741 = 0.5 * y_76740;
}
__local char *mem_80781;
mem_80781 = (__local char *) mem_80781_backing_2;
// ---- Phase 2 -------------------------------------------------------------
// mem_80781[ctid] = 2 * rev_sqnorm_arg * sum_i mem_80774[i] *
// mem_80778[ltid * D + i] (the doubling is written as res + res).
for (int32_t comb_iter_82580 = 0; comb_iter_82580 < squot32(D_68526 +
computed_group_sizze_76480 -
1,
computed_group_sizze_76480);
comb_iter_82580++) {
int32_t ctid_72474;
int32_t flat_comb_id_82581 = comb_iter_82580 *
computed_group_sizze_76480 + local_tid_72546;
ctid_72474 = flat_comb_id_82581;
if (slt32(ctid_72474, D_68526) && 1) {
double res_76744;
double x_76747 = 0.0;
int32_t chunk_sizze_76745;
int32_t chunk_offset_76746 = 0;
chunk_sizze_76745 = D_68526;
double res_76750;
double acc_76753 = x_76747;
int32_t groupstream_mapaccum_dummy_chunk_sizze_76751;
groupstream_mapaccum_dummy_chunk_sizze_76751 = 1;
// chunk_sizze_76745 was assigned D_68526 just above, so this condition
// holds; the else branch runs the same loop body with the bound spelled as
// chunk_sizze_76745.
if (chunk_sizze_76745 == D_68526) {
for (int32_t i_76752 = 0; i_76752 < D_68526; i_76752++) {
double x_76756;
double x_76757;
double res_76759;
double res_76761;
x_76756 = *(__local
double *) &mem_80774[(chunk_offset_76746 +
i_76752) * 8];
x_76757 = *(__local double *) &mem_80778[(ltid_72457 *
D_68526 +
chunk_offset_76746 +
i_76752) * 8];
res_76759 = x_76756 * x_76757;
res_76761 = acc_76753 + res_76759;
double acc_tmp_82582 = res_76761;
acc_76753 = acc_tmp_82582;
}
} else {
for (int32_t i_76752 = 0; i_76752 < chunk_sizze_76745;
i_76752++) {
double x_76756;
double x_76757;
double res_76759;
double res_76761;
x_76756 = *(__local
double *) &mem_80774[(chunk_offset_76746 +
i_76752) * 8];
x_76757 = *(__local double *) &mem_80778[(ltid_72457 *
D_68526 +
chunk_offset_76746 +
i_76752) * 8];
res_76759 = x_76756 * x_76757;
res_76761 = acc_76753 + res_76759;
double acc_tmp_82583 = res_76761;
acc_76753 = acc_tmp_82583;
}
}
res_76750 = acc_76753;
x_76747 = res_76750;
res_76744 = x_76747;
double res_76762;
double res_76763;
res_76762 = rev_sqnorm_arg_76741 * res_76744;
res_76763 = res_76762 + res_76762;
*(__local double *) &mem_80781[ctid_72474 * 8] = res_76763;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Bounds guard with an empty body: no effect.
if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&
slt32(ltid_72457, computed_group_sizze_76480)) { }
__local char *mem_80821;
__local char *mem_80825;
__local char *mem_80828;
mem_80821 = (__local char *) mem_80821_backing_3;
mem_80825 = (__local char *) mem_80825_backing_4;
mem_80828 = (__local char *) mem_80828_backing_5;
// ---- Phase 3 -------------------------------------------------------------
// For every ctid < D_68526:
//   * res_76790 = -(sum_i mem_80778[ltid + D * i] * mem_80781[i])
//     (a column of mem_80778 dotted with mem_80781), stored in
//     mem_80828[ctid];
//   * for each i, a length-D and a length-triD vector are built in global
//     scratch and stored as slice i of mem_80785 / mem_80789; the slices are
//     then summed over i into mem_80792 / mem_80795 and copied into row ctid
//     of the local matrices mem_80821 (D wide) and mem_80825 (triD wide).
for (int32_t comb_iter_82584 = 0; comb_iter_82584 < squot32(D_68526 +
computed_group_sizze_76480 -
1,
computed_group_sizze_76480);
comb_iter_82584++) {
int32_t ctid_72513;
int32_t flat_comb_id_82585 = comb_iter_82584 *
computed_group_sizze_76480 + local_tid_72546;
ctid_72513 = flat_comb_id_82585;
if (slt32(ctid_72513, D_68526) && 1) {
double x_76768;
double qs_elem_elem_76771;
x_76768 = *(__local double *) &mem_80781[ltid_72457 * 8];
qs_elem_elem_76771 = *(__global
double *) &qs_mem_80369[(gtid_72455 *
D_68514 +
ltid_72457) * 8];
double res_76772;
double x_76775 = 0.0;
int32_t chunk_sizze_76773;
int32_t chunk_offset_76774 = 0;
chunk_sizze_76773 = D_68526;
double res_76778;
double acc_76781 = x_76775;
int32_t groupstream_mapaccum_dummy_chunk_sizze_76779;
groupstream_mapaccum_dummy_chunk_sizze_76779 = 1;
// Always true (chunk_sizze_76773 == D_68526 by the assignment above); both
// branches perform the same accumulation.
if (chunk_sizze_76773 == D_68526) {
for (int32_t i_76780 = 0; i_76780 < D_68526; i_76780++) {
double x_76784;
double x_76785;
double res_76787;
double res_76789;
x_76784 = *(__local double *) &mem_80778[(ltid_72457 +
D_68526 *
chunk_offset_76774 +
D_68526 *
i_76780 + 0 *
D_68526) * 8];
x_76785 = *(__local
double *) &mem_80781[(chunk_offset_76774 +
i_76780) * 8];
res_76787 = x_76784 * x_76785;
res_76789 = acc_76781 + res_76787;
double acc_tmp_82586 = res_76789;
acc_76781 = acc_tmp_82586;
}
} else {
for (int32_t i_76780 = 0; i_76780 < chunk_sizze_76773;
i_76780++) {
double x_76784;
double x_76785;
double res_76787;
double res_76789;
x_76784 = *(__local double *) &mem_80778[(ltid_72457 +
D_68526 *
chunk_offset_76774 +
D_68526 *
i_76780 + 0 *
D_68526) * 8];
x_76785 = *(__local
double *) &mem_80781[(chunk_offset_76774 +
i_76780) * 8];
res_76787 = x_76784 * x_76785;
res_76789 = acc_76781 + res_76787;
double acc_tmp_82587 = res_76789;
acc_76781 = acc_tmp_82587;
}
}
res_76778 = acc_76781;
x_76775 = res_76778;
res_76772 = x_76775;
double res_76790;
res_76790 = 0.0 - res_76772;
// Per-i contribution. res_76800 = mem_80781[ltid] * mem_80774[i]. Three
// cases, mirroring the case split of phase 1:
//   ltid <  i : both vectors are all zero;
//   ltid == i : length-D vector holds res_76800 * exp(qs element) at
//               position ltid, zero elsewhere; length-triD vector is zero;
//   ltid >  i : length-triD vector holds res_76800 at position
//               i + ltid*(ltid-1)/2, zero elsewhere; length-D vector is zero.
for (int32_t i_76797 = 0; i_76797 < D_68526; i_76797++) {
double x_76798;
double res_76800;
bool cond_76801;
bool cond_76802;
x_76798 = *(__local double *) &mem_80774[i_76797 * 8];
res_76800 = x_76768 * x_76798;
cond_76801 = slt32(ltid_72457, i_76797);
cond_76802 = ltid_72457 == i_76797;
if (cond_76801) {
// Zero-fill mem_80798 (D) and mem_80801 (triD), then copy them into the
// branch-result buffers mem_81742 and mem_81739.
for (int32_t i_82590 = 0; i_82590 < D_68526; i_82590++) {
*(__global double *) &mem_80798[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82590 *
computed_group_sizze_76480) *
8] = 0.0;
}
for (int32_t i_82591 = 0; i_82591 < triD_68516; i_82591++) {
*(__global double *) &mem_80801[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82591 *
computed_group_sizze_76480) *
8] = 0.0;
}
for (int32_t i_82592 = 0; i_82592 < D_68526; i_82592++) {
*(__global double *) &mem_81742[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82592 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_80798[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82592 *
computed_group_sizze_76480) *
8];
}
for (int32_t i_82593 = 0; i_82593 < triD_68516; i_82593++) {
*(__global double *) &mem_81739[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82593 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_80801[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82593 *
computed_group_sizze_76480) *
8];
}
} else {
if (cond_76802) {
// Diagonal case: one non-zero entry in the length-D vector (mem_80804),
// zeros in the length-triD vector (mem_80807); results go to
// mem_81732 / mem_81729.
double res_76809;
double deltaVec_arg_76810;
res_76809 = futrts_exp64(qs_elem_elem_76771);
deltaVec_arg_76810 = res_76800 * res_76809;
for (int32_t i_76815 = 0; i_76815 < D_68526;
i_76815++) {
bool cond_76817;
double res_76818;
cond_76817 = i_76815 == ltid_72457;
if (cond_76817) {
res_76818 = deltaVec_arg_76810;
} else {
res_76818 = 0.0;
}
*(__global double *) &mem_80804[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_76815 *
computed_group_sizze_76480) *
8] = res_76818;
}
for (int32_t i_82595 = 0; i_82595 < triD_68516;
i_82595++) {
*(__global double *) &mem_80807[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82595 *
computed_group_sizze_76480) *
8] = 0.0;
}
for (int32_t i_82596 = 0; i_82596 < D_68526;
i_82596++) {
*(__global double *) &mem_81732[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82596 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_80804[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82596 *
computed_group_sizze_76480) *
8];
}
for (int32_t i_82597 = 0; i_82597 < triD_68516;
i_82597++) {
*(__global double *) &mem_81729[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82597 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_80807[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82597 *
computed_group_sizze_76480) *
8];
}
} else {
// ltid > i: one non-zero entry in the length-triD vector (mem_80810) at
// index i + ltid*(ltid-1)/2, zeros in the length-D vector (mem_80813);
// results go to mem_81732 / mem_81729.
int32_t y_76821;
int32_t x_76822;
int32_t res_76823;
int32_t deltaVec_arg_76824;
y_76821 = ltid_72457 - 1;
x_76822 = ltid_72457 * y_76821;
res_76823 = sdiv32(x_76822, 2);
deltaVec_arg_76824 = i_76797 + res_76823;
for (int32_t i_76829 = 0; i_76829 < triD_68516;
i_76829++) {
bool cond_76831;
double res_76832;
cond_76831 = i_76829 == deltaVec_arg_76824;
if (cond_76831) {
res_76832 = res_76800;
} else {
res_76832 = 0.0;
}
*(__global double *) &mem_80810[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_76829 *
computed_group_sizze_76480) *
8] = res_76832;
}
for (int32_t i_82599 = 0; i_82599 < D_68526;
i_82599++) {
*(__global double *) &mem_80813[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82599 *
computed_group_sizze_76480) *
8] = 0.0;
}
for (int32_t i_82600 = 0; i_82600 < D_68526;
i_82600++) {
*(__global double *) &mem_81732[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82600 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_80813[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82600 *
computed_group_sizze_76480) *
8];
}
for (int32_t i_82601 = 0; i_82601 < triD_68516;
i_82601++) {
*(__global double *) &mem_81729[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82601 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_80810[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82601 *
computed_group_sizze_76480) *
8];
}
}
// Forward the inner-branch result (mem_81732 / mem_81729) to the outer
// branch-result buffers (mem_81742 / mem_81739).
for (int32_t i_82602 = 0; i_82602 < D_68526; i_82602++) {
*(__global double *) &mem_81742[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82602 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_81732[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82602 *
computed_group_sizze_76480) *
8];
}
for (int32_t i_82603 = 0; i_82603 < triD_68516; i_82603++) {
*(__global double *) &mem_81739[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82603 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_81729[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82603 *
computed_group_sizze_76480) *
8];
}
}
// Store this iteration's two vectors as slice i_76797 of mem_80785 (D x D
// per work-item) and mem_80789 (D x triD per work-item).
for (int32_t i_82604 = 0; i_82604 < D_68526; i_82604++) {
*(__global double *) &mem_80785[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526 * D_68526) +
local_tid_72546 + i_76797 *
(computed_group_sizze_76480 *
D_68526) + i_82604 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_81742[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82604 *
computed_group_sizze_76480) *
8];
}
for (int32_t i_82605 = 0; i_82605 < triD_68516; i_82605++) {
*(__global double *) &mem_80789[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516 * D_68526) +
local_tid_72546 + i_76797 *
(computed_group_sizze_76480 *
triD_68516) + i_82605 *
computed_group_sizze_76480) *
8] = *(__global
double *) &mem_81739[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82605 *
computed_group_sizze_76480) *
8];
}
}
// Sum the D slices of mem_80785 element-wise into mem_80792 (length D).
for (int32_t i_76841 = 0; i_76841 < D_68526; i_76841++) {
double res_76843;
double redout_76844 = 0.0;
for (int32_t i_76845 = 0; i_76845 < D_68526; i_76845++) {
double x_76846;
double res_76849;
x_76846 = *(__global double *) &mem_80785[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526 *
D_68526) +
local_tid_72546 +
(i_76845 *
(computed_group_sizze_76480 *
D_68526) +
i_76841 *
computed_group_sizze_76480)) *
8];
res_76849 = redout_76844 + x_76846;
double redout_tmp_82607 = res_76849;
redout_76844 = redout_tmp_82607;
}
res_76843 = redout_76844;
*(__global double *) &mem_80792[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) + local_tid_72546 +
i_76841 *
computed_group_sizze_76480) *
8] = res_76843;
}
// Sum the D slices of mem_80789 element-wise into mem_80795 (length triD).
for (int32_t i_76855 = 0; i_76855 < triD_68516; i_76855++) {
double res_76857;
double redout_76858 = 0.0;
for (int32_t i_76859 = 0; i_76859 < D_68526; i_76859++) {
double x_76860;
double res_76863;
x_76860 = *(__global double *) &mem_80789[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516 *
D_68526) +
local_tid_72546 +
(i_76859 *
(computed_group_sizze_76480 *
triD_68516) +
i_76855 *
computed_group_sizze_76480)) *
8];
res_76863 = redout_76858 + x_76860;
double redout_tmp_82609 = res_76863;
redout_76858 = redout_tmp_82609;
}
res_76857 = redout_76858;
*(__global double *) &mem_80795[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 + i_76855 *
computed_group_sizze_76480) *
8] = res_76857;
}
// Publish the two summed vectors as row ctid of the local matrices
// mem_80821 and mem_80825, and the negated dot product in mem_80828[ctid].
for (int32_t i_82610 = 0; i_82610 < D_68526; i_82610++) {
*(__local double *) &mem_80821[(ctid_72513 * D_68526 +
i_82610) * 8] = *(__global
double *) &mem_80792[(group_id_72547 *
(computed_group_sizze_76480 *
D_68526) +
local_tid_72546 +
i_82610 *
computed_group_sizze_76480) *
8];
}
for (int32_t i_82611 = 0; i_82611 < triD_68516; i_82611++) {
*(__local double *) &mem_80825[(ctid_72513 * triD_68516 +
i_82611) * 8] = *(__global
double *) &mem_80795[(group_id_72547 *
(computed_group_sizze_76480 *
triD_68516) +
local_tid_72546 +
i_82611 *
computed_group_sizze_76480) *
8];
}
*(__local double *) &mem_80828[ctid_72513 * 8] = res_76790;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Bounds guard with an empty body: no effect.
if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&
slt32(ltid_72457, computed_group_sizze_76480)) { }
__local char *mem_80831;
mem_80831 = (__local char *) mem_80831_backing_6;
// ---- Phase 4 -------------------------------------------------------------
// For every ctid < triD_68516: mem_80831[ctid] = sum over the D rows of
// mem_80825 of the element in column ltid.
for (int32_t comb_iter_82612 = 0; comb_iter_82612 < squot32(triD_68516 +
computed_group_sizze_76480 -
1,
computed_group_sizze_76480);
comb_iter_82612++) {
int32_t ctid_72523;
int32_t flat_comb_id_82613 = comb_iter_82612 *
computed_group_sizze_76480 + local_tid_72546;
ctid_72523 = flat_comb_id_82613;
if (slt32(ctid_72523, triD_68516) && 1) {
double res_76868;
double x_76871 = 0.0;
int32_t chunk_sizze_76869;
int32_t chunk_offset_76870 = 0;
chunk_sizze_76869 = D_68526;
double res_76873;
double acc_76876 = x_76871;
int32_t groupstream_mapaccum_dummy_chunk_sizze_76874;
groupstream_mapaccum_dummy_chunk_sizze_76874 = 1;
// Always true; the else branch is the same accumulation.
if (chunk_sizze_76869 == D_68526) {
for (int32_t i_76875 = 0; i_76875 < D_68526; i_76875++) {
double x_76878;
double res_76881;
x_76878 = *(__local double *) &mem_80825[(ltid_72457 +
triD_68516 *
chunk_offset_76870 +
triD_68516 *
i_76875 + 0 *
triD_68516) * 8];
res_76881 = acc_76876 + x_76878;
double acc_tmp_82614 = res_76881;
acc_76876 = acc_tmp_82614;
}
} else {
for (int32_t i_76875 = 0; i_76875 < chunk_sizze_76869;
i_76875++) {
double x_76878;
double res_76881;
x_76878 = *(__local double *) &mem_80825[(ltid_72457 +
triD_68516 *
chunk_offset_76870 +
triD_68516 *
i_76875 + 0 *
triD_68516) * 8];
res_76881 = acc_76876 + x_76878;
double acc_tmp_82615 = res_76881;
acc_76876 = acc_tmp_82615;
}
}
res_76873 = acc_76876;
x_76871 = res_76873;
res_76868 = x_76871;
*(__local double *) &mem_80831[ctid_72523 * 8] = res_76868;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Bounds guard with an empty body: no effect.
if ((slt32(gtid_72454, N_68508) && slt32(gtid_72455, K_68510)) &&
slt32(ltid_72457, computed_group_sizze_76480)) { }
__local char *mem_80834;
mem_80834 = (__local char *) mem_80834_backing_7;
// ---- Phase 5 -------------------------------------------------------------
// For every ctid < D_68526: mem_80834[ctid] = res_76707 + sum over the D rows
// of mem_80821 of the element in column ltid.
for (int32_t comb_iter_82616 = 0; comb_iter_82616 < squot32(D_68526 +
computed_group_sizze_76480 -
1,
computed_group_sizze_76480);
comb_iter_82616++) {
int32_t ctid_72533;
int32_t flat_comb_id_82617 = comb_iter_82616 *
computed_group_sizze_76480 + local_tid_72546;
ctid_72533 = flat_comb_id_82617;
if (slt32(ctid_72533, D_68526) && 1) {
double res_76885;
double x_76888 = 0.0;
int32_t chunk_sizze_76886;
int32_t chunk_offset_76887 = 0;
chunk_sizze_76886 = D_68526;
double res_76890;
double acc_76893 = x_76888;
int32_t groupstream_mapaccum_dummy_chunk_sizze_76891;
groupstream_mapaccum_dummy_chunk_sizze_76891 = 1;
// Always true; the else branch is the same accumulation.
if (chunk_sizze_76886 == D_68526) {
for (int32_t i_76892 = 0; i_76892 < D_68526; i_76892++) {
double x_76895;
double res_76898;
x_76895 = *(__local double *) &mem_80821[(ltid_72457 +
D_68526 *
chunk_offset_76887 +
D_68526 *
i_76892 + 0 *
D_68526) * 8];
res_76898 = acc_76893 + x_76895;
double acc_tmp_82618 = res_76898;
acc_76893 = acc_tmp_82618;
}
} else {
for (int32_t i_76892 = 0; i_76892 < chunk_sizze_76886;
i_76892++) {
double x_76895;
double res_76898;
x_76895 = *(__local double *) &mem_80821[(ltid_72457 +
D_68526 *
chunk_offset_76887 +
D_68526 *
i_76892 + 0 *
D_68526) * 8];
res_76898 = acc_76893 + x_76895;
double acc_tmp_82619 = res_76898;
acc_76893 = acc_tmp_82619;
}
}
res_76890 = acc_76893;
x_76888 = res_76890;
res_76885 = x_76888;
double res_76899 = res_76707 + res_76885;
*(__local double *) &mem_80834[ctid_72533 * 8] = res_76899;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// ---- Write-back ----------------------------------------------------------
// Work-item 0 stores its res_76707 as the group's scalar result. This store
// is not bounds-guarded, so res_76707 is unassigned here if work-item 0
// failed the guard near the top of the kernel.
if (local_tid_72546 == 0) {
*(__global double *) &mem_80837[group_id_72547 * 8] = res_76707;
}
// The three loops below copy mem_80828 (D), mem_80834 (D) and mem_80831
// (triD) to row group_id of mem_80841, mem_80845 and mem_80849. Each
// work-item handles elements local_tid, local_tid + gs, local_tid + 2*gs, ...
for (int32_t i_82621 = 0; i_82621 < squot32(D_68526 - local_tid_72546 +
computed_group_sizze_76480 - 1,
computed_group_sizze_76480);
i_82621++) {
*(__global double *) &mem_80841[(group_id_72547 * D_68526 + (i_82621 *
computed_group_sizze_76480 +
local_tid_72546)) *
8] = *(__local
double *) &mem_80828[(i_82621 *
computed_group_sizze_76480 +
local_tid_72546) *
8];
}
for (int32_t i_82622 = 0; i_82622 < squot32(D_68526 - local_tid_72546 +
computed_group_sizze_76480 - 1,
computed_group_sizze_76480);
i_82622++) {
*(__global double *) &mem_80845[(group_id_72547 * D_68526 + (i_82622 *
computed_group_sizze_76480 +
local_tid_72546)) *
8] = *(__local
double *) &mem_80834[(i_82622 *
computed_group_sizze_76480 +
local_tid_72546) *
8];
}
for (int32_t i_82623 = 0; i_82623 < squot32(triD_68516 - local_tid_72546 +
computed_group_sizze_76480 - 1,
computed_group_sizze_76480);
i_82623++) {
*(__global double *) &mem_80849[(group_id_72547 * triD_68516 +
(i_82623 * computed_group_sizze_76480 +
local_tid_72546)) * 8] = *(__local
double *) &mem_80831[(i_82623 *
computed_group_sizze_76480 +
local_tid_72546) *
8];
}
}
// map_intra_group_73000
//
// One work-group sums D_68526 consecutive doubles of mem_81370 and adds a
// per-(n, k) value from mem_80863; work-item 0 stores the result in
// mem_81376[group_id].
//
// The flat global id is decomposed as
//   global_tid = idx_n * (K*D*D) + idx_k * (D*D) + idx_d * D + ltid
// and a work-item is "in bounds" when idx_n < N, idx_k < K, idx_d < D and
// ltid < D.
//
// Arguments:
//   mem_81373_backing_aligned_0  __local scratch, used as D_68526 doubles.
//   N_68508, K_68510, D_68526    extents used for the index decomposition
//                                and bounds checks.
//   mem_80863                    read as double at [idx_n * K + idx_k].
//   mem_81370                    read as double at the flat (n, k, d, ltid)
//                                index.
//   mem_81376                    written as double at [group_id].
//
// Changes relative to the generated code this replaces:
//   * Every value that used to be read while unassigned when the bounds
//     check fails (the loaded element, the reduction temporary, the final
//     sum) is now defined: out-of-bounds work-items contribute 0.0, leave
//     their accumulator unchanged, and work-item 0 stores 0.0.
//   * Buffers are indexed through double-typed pointers instead of
//     `int32 index * 8` byte offsets, so the offset can no longer overflow
//     32-bit arithmetic.
//   * Unused locals were dropped.
// The set of memory locations written is unchanged.
//
// NOTE(review): the local scratch is only filled by work-items with
// local_tid < D_68526, so the kernel presumably expects a local size of at
// least D_68526 -- confirm against the host-side launch code.
__kernel void map_intra_group_73000(__local volatile
                                    int64_t *mem_81373_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t D_68526, __global
                                    unsigned char *mem_80863, __global
                                    unsigned char *mem_81370, __global
                                    unsigned char *mem_81376)
{
    // Typed views of the raw buffers; every element is a double.
    __local double *scratch = (__local double *) mem_81373_backing_aligned_0;
    volatile __local double *scratch_v =
        (volatile __local double *) mem_81373_backing_aligned_0;
    __global const double *offsets = (__global const double *) mem_80863;
    __global const double *terms = (__global const double *) mem_81370;
    __global double *sums = (__global double *) mem_81376;

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;

    // Unflatten the global id by successive quotient / remainder.
    const int32_t per_n = K_68510 * D_68526 * D_68526;
    const int32_t per_k = D_68526 * D_68526;
    const int32_t idx_n = squot32(global_tid, per_n);
    const int32_t rem_n = global_tid - idx_n * per_n;
    const int32_t idx_k = squot32(rem_n, per_k);
    const int32_t rem_k = rem_n - idx_k * per_k;
    const int32_t idx_d = squot32(rem_k, D_68526);
    const int32_t ltid = rem_k - idx_d * D_68526;

    const int in_bounds = ((slt32(idx_n, N_68508) && slt32(idx_k, K_68510)) &&
                           slt32(idx_d, D_68526)) && slt32(ltid, D_68526);

    // Out-of-bounds work-items keep the neutral value 0.0.
    double offset_val = 0.0;
    double term = 0.0;
    if (in_bounds) {
        offset_val = offsets[idx_n * K_68510 + idx_k];
        term = terms[idx_n * (D_68526 * D_68526 * K_68510) +
                     idx_k * (D_68526 * D_68526) +
                     idx_d * D_68526 + ltid];
    }

    // Stage the per-work-item element in local memory.
    if (slt32(local_tid, D_68526)) {
        scratch[local_tid] = term;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating work-items read their initial accumulator.
    double acc = 0.0;
    if (slt32(local_tid, D_68526)) {
        acc = scratch[local_tid];
    }

    const int32_t wave = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave * wave_size;

    // Step 1: tree reduction inside each wave. No barrier is issued between
    // steps; the accesses are volatile and the stride is bounded by
    // LOCKSTEP_WIDTH, i.e. this relies on the lanes of a wave running in
    // lockstep.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, D_68526) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = scratch_v[local_tid + offset];
            if (in_bounds) {
                acc = acc + other;
            }
            scratch_v[local_tid] = acc;
        }
    }

    // Step 2: combine the per-wave partial sums held by lane 0 of each wave,
    // with a barrier before every step. The trip count depends only on
    // D_68526 and the wave size, so all work-items reach every barrier.
    const int32_t num_waves = squot32(D_68526 + wave_size - 1, wave_size);
    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip_waves * wave_size;
        if (slt32(local_tid + offset, D_68526) &&
            (lane == 0 && (wave & (2 * skip_waves - 1)) == 0)) {
            const double other = scratch[local_tid + offset];
            if (in_bounds) {
                acc = acc + other;
            }
            scratch[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the group's sum.
    const double group_sum = scratch[0];
    double result = 0.0;
    if (in_bounds) {
        result = offset_val + group_sum;
    }
    if (local_tid == 0) {
        sums[group_id] = result;
    }
}
// map_intra_group_73195
//
// One work-group sums D_68526 consecutive doubles of mem_81312; work-item 0
// stores the sum in mem_81318[group_id].
//
// The flat global id is decomposed as
//   global_tid = idx_n * (K*triD*D) + idx_k * (triD*D) + idx_t * D + ltid
// and a work-item is "in bounds" when idx_n < N, idx_k < K, idx_t < triD and
// ltid < D.
//
// Arguments:
//   mem_81315_backing_aligned_0  __local scratch, used as D_68526 doubles.
//   N_68508, K_68510, triD_68516, D_68526
//                                extents used for the index decomposition
//                                and bounds checks.
//   mem_81312                    read as double at the flat
//                                (n, k, t, ltid) index.
//   mem_81318                    written as double at [group_id].
//
// Changes relative to the generated code this replaces:
//   * The loaded element and the reduction temporary used to be read while
//     unassigned when the bounds check fails; out-of-bounds work-items now
//     contribute 0.0 and leave their accumulator unchanged.
//   * Buffers are indexed through double-typed pointers instead of
//     `int32 index * 8` byte offsets, so the offset can no longer overflow
//     32-bit arithmetic.
//   * Unused locals were dropped.
// The set of memory locations written is unchanged.
//
// NOTE(review): the local scratch is only filled by work-items with
// local_tid < D_68526, so the kernel presumably expects a local size of at
// least D_68526 -- confirm against the host-side launch code.
__kernel void map_intra_group_73195(__local volatile
                                    int64_t *mem_81315_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t triD_68516, int32_t D_68526,
                                    __global unsigned char *mem_81312, __global
                                    unsigned char *mem_81318)
{
    // Typed views of the raw buffers; every element is a double.
    __local double *scratch = (__local double *) mem_81315_backing_aligned_0;
    volatile __local double *scratch_v =
        (volatile __local double *) mem_81315_backing_aligned_0;
    __global const double *terms = (__global const double *) mem_81312;
    __global double *sums = (__global double *) mem_81318;

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;

    // Unflatten the global id by successive quotient / remainder.
    const int32_t per_n = K_68510 * triD_68516 * D_68526;
    const int32_t per_k = triD_68516 * D_68526;
    const int32_t idx_n = squot32(global_tid, per_n);
    const int32_t rem_n = global_tid - idx_n * per_n;
    const int32_t idx_k = squot32(rem_n, per_k);
    const int32_t rem_k = rem_n - idx_k * per_k;
    const int32_t idx_t = squot32(rem_k, D_68526);
    const int32_t ltid = rem_k - idx_t * D_68526;

    const int in_bounds = ((slt32(idx_n, N_68508) && slt32(idx_k, K_68510)) &&
                           slt32(idx_t, triD_68516)) && slt32(ltid, D_68526);

    // Out-of-bounds work-items keep the neutral value 0.0.
    double term = 0.0;
    if (in_bounds) {
        term = terms[idx_n * (D_68526 * triD_68516 * K_68510) +
                     idx_k * (D_68526 * triD_68516) +
                     idx_t * D_68526 + ltid];
    }

    // Stage the per-work-item element in local memory.
    if (slt32(local_tid, D_68526)) {
        scratch[local_tid] = term;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating work-items read their initial accumulator.
    double acc = 0.0;
    if (slt32(local_tid, D_68526)) {
        acc = scratch[local_tid];
    }

    const int32_t wave = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave * wave_size;

    // Step 1: tree reduction inside each wave. No barrier is issued between
    // steps; the accesses are volatile and the stride is bounded by
    // LOCKSTEP_WIDTH, i.e. this relies on the lanes of a wave running in
    // lockstep.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, D_68526) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = scratch_v[local_tid + offset];
            if (in_bounds) {
                acc = acc + other;
            }
            scratch_v[local_tid] = acc;
        }
    }

    // Step 2: combine the per-wave partial sums held by lane 0 of each wave,
    // with a barrier before every step. The trip count depends only on
    // D_68526 and the wave size, so all work-items reach every barrier.
    const int32_t num_waves = squot32(D_68526 + wave_size - 1, wave_size);
    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip_waves * wave_size;
        if (slt32(local_tid + offset, D_68526) &&
            (lane == 0 && (wave & (2 * skip_waves - 1)) == 0)) {
            const double other = scratch[local_tid + offset];
            if (in_bounds) {
                acc = acc + other;
            }
            scratch[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the group's sum; work-item 0 publishes it.
    const double group_sum = scratch[0];
    if (local_tid == 0) {
        sums[group_id] = group_sum;
    }
}
/* Store 0.0 to dst[base + i * stride] for i = 0 .. count-1. */
static inline void mig_73444_zero_strided(__global double *dst, int32_t base,
                                          int32_t stride, int32_t count)
{
    for (int32_t i = 0; i < count; i++) {
        dst[base + i * stride] = 0.0;
    }
}

/* Copy src[base + i * stride] to dst[base + i * stride] for i = 0 .. count-1. */
static inline void mig_73444_copy_strided(__global double *dst,
                                          __global double *src, int32_t base,
                                          int32_t stride, int32_t count)
{
    for (int32_t i = 0; i < count; i++) {
        dst[base + i * stride] = src[base + i * stride];
    }
}

/* Store `value` at position i == hot and 0.0 at every other position of
 * dst[base + i * stride], i = 0 .. count-1. */
static inline void mig_73444_one_hot_strided(__global double *dst, int32_t base,
                                             int32_t stride, int32_t count,
                                             int32_t hot, double value)
{
    for (int32_t i = 0; i < count; i++) {
        dst[base + i * stride] = (i == hot) ? value : 0.0;
    }
}

/*
 * map_intra_group_73444
 *
 * Below, D = D_68526, triD = triD_68516, cgs = computed_group_sizze_76480.
 *
 * The global id is split into (gtid_n, gtid_k, gtid_d, ltid) with extents
 * (N_68508, K_68510, D, cgs); a thread is "active" when all four are in
 * range.  The kernel then performs, separated by local-memory barriers:
 *
 *   1. Fill the local array behind mem_81029_backing_aligned_0 with products
 *      mem_80942[..] * res_r_r_mem_80930[..] and add-reduce its first D
 *      entries; the negated sum is written to mem_81076[group_id] by local
 *      thread 0.
 *   2. For every combine index below D, build one vector of length D and one
 *      of length triD in global scratch (zeros, or a single non-zero entry,
 *      depending on whether gtid_d < ltid, gtid_d == ltid, or neither), pass
 *      them through the intermediate buffers mem_81772/mem_81769 and
 *      mem_81782/mem_81779, and copy them into the local arrays behind
 *      backing buffers 1 (row length D) and 2 (row length triD).
 *   3. Sum D rows of each local array at column ltid into the local arrays
 *      behind backing buffers 3 and 4.
 *   4. Copy those sums to mem_81073 (D per group) and mem_81069 (triD per
 *      group).
 *
 * Differences from the generated original:
 *   - Values that were only assigned under the "active" guard are now
 *     initialised, and an inactive thread keeps its accumulator during the
 *     reduction instead of storing an indeterminate value.
 *   - Buffers are indexed through typed double pointers, so no 32-bit
 *     `index * 8` byte offset is formed.
 *   - The int64_t backing pointers are cast explicitly.
 * The order of all global and local stores is unchanged.
 */
__kernel void map_intra_group_73444(__local volatile
                                    int64_t *mem_81029_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_81055_backing_aligned_1,
                                    __local volatile
                                    int64_t *mem_81059_backing_aligned_2,
                                    __local volatile
                                    int64_t *mem_81062_backing_aligned_3,
                                    __local volatile
                                    int64_t *mem_81065_backing_aligned_4,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t triD_68516, int32_t D_68526,
                                    int32_t computed_group_sizze_76480,
                                    __global unsigned char *mem_80873,
                                    __global unsigned char *mem_80889,
                                    __global unsigned char *res_r_r_mem_80930,
                                    __global unsigned char *mem_80942,
                                    __global unsigned char *mem_81026,
                                    __global unsigned char *mem_81032,
                                    __global unsigned char *mem_81035,
                                    __global unsigned char *mem_81038,
                                    __global unsigned char *mem_81041,
                                    __global unsigned char *mem_81044,
                                    __global unsigned char *mem_81047,
                                    __global unsigned char *mem_81069,
                                    __global unsigned char *mem_81073,
                                    __global unsigned char *mem_81076,
                                    __global unsigned char *mem_81769,
                                    __global unsigned char *mem_81772,
                                    __global unsigned char *mem_81779,
                                    __global unsigned char *mem_81782)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    const int32_t cgs = computed_group_sizze_76480;

    // Typed views of the local backing buffers.
    __local double *prod = (__local double *) mem_81029_backing_aligned_0;
    volatile __local double *prod_v =
        (volatile __local double *) mem_81029_backing_aligned_0;
    __local double *rows_D = (__local double *) mem_81055_backing_aligned_1;
    __local double *rows_T = (__local double *) mem_81059_backing_aligned_2;
    __local double *sums_D = (__local double *) mem_81062_backing_aligned_3;
    __local double *sums_T = (__local double *) mem_81065_backing_aligned_4;

    // Typed views of the global buffers.
    __global double *in_80873 = (__global double *) mem_80873;
    __global double *in_80889 = (__global double *) mem_80889;
    __global double *in_80930 = (__global double *) res_r_r_mem_80930;
    __global double *in_80942 = (__global double *) mem_80942;
    __global double *in_81026 = (__global double *) mem_81026;
    __global double *scr_81032 = (__global double *) mem_81032;
    __global double *scr_81035 = (__global double *) mem_81035;
    __global double *scr_81038 = (__global double *) mem_81038;
    __global double *scr_81041 = (__global double *) mem_81041;
    __global double *scr_81044 = (__global double *) mem_81044;
    __global double *scr_81047 = (__global double *) mem_81047;
    __global double *out_81069 = (__global double *) mem_81069;
    __global double *out_81073 = (__global double *) mem_81073;
    __global double *out_81076 = (__global double *) mem_81076;
    __global double *scr_81769 = (__global double *) mem_81769;
    __global double *scr_81772 = (__global double *) mem_81772;
    __global double *scr_81779 = (__global double *) mem_81779;
    __global double *scr_81782 = (__global double *) mem_81782;

    // Split the flat global id by successive quotient/remainder.
    const int32_t span_n = K_68510 * D_68526 * cgs;
    const int32_t span_k = D_68526 * cgs;
    const int32_t gtid_n = squot32(global_tid, span_n);
    const int32_t rem_n = global_tid - gtid_n * span_n;
    const int32_t gtid_k = squot32(rem_n, span_k);
    const int32_t rem_k = rem_n - gtid_k * span_k;
    const int32_t gtid_d = squot32(rem_k, cgs);
    const int32_t ltid = rem_k - gtid_d * cgs;

    const bool active = ((slt32(gtid_n, N_68508) && slt32(gtid_k, K_68510)) &&
                         slt32(gtid_d, D_68526)) && slt32(ltid, cgs);

    double x_81026 = 0.0;
    double qs_elem = 0.0;
    if (active) {
        const int32_t idx = gtid_k * (N_68508 * D_68526) + gtid_d * N_68508 +
                            gtid_n;
        x_81026 = in_81026[idx];
        qs_elem = in_80873[idx];
    }

    // Number of group-sized passes needed to cover D (resp. triD) entries.
    const int32_t rounds_D = squot32(D_68526 + cgs - 1, cgs);
    const int32_t rounds_T = squot32(triD_68516 + cgs - 1, cgs);

    // ---- Step 1: products into local memory, then add-reduce ------------
    for (int32_t r = 0; r < rounds_D; r++) {
        const int32_t ctid = r * cgs + local_tid;
        if (slt32(ctid, D_68526)) {
            const double a = in_80942[gtid_d * (K_68510 * N_68508 * D_68526) +
                                      ltid * (K_68510 * N_68508) +
                                      gtid_n * K_68510 + gtid_k];
            const double b = in_80930[gtid_n * (D_68526 * K_68510) +
                                      gtid_k * D_68526 + ltid];
            prod[ctid] = a * b;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // participating threads read initial accumulator
    double acc = 0.0;
    if (slt32(local_tid, D_68526)) {
        acc = prod[local_tid];
    }

    const int32_t wave_id = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave_id * wave_size;

    // Phase A: strides below LOCKSTEP_WIDTH.  No barrier is issued here; the
    // accesses are volatile, as in the original (presumably relying on
    // lockstep execution within LOCKSTEP_WIDTH consecutive threads).
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, D_68526) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = prod_v[local_tid + offset];
            if (active) {
                acc = acc + other;
            }
            prod_v[local_tid] = acc;
        }
    }

    // Phase B: strides that are multiples of LOCKSTEP_WIDTH, with a barrier
    // before each step.  The trip count is derived from cgs, the participation
    // test from D, exactly as in the original.
    const int32_t num_waves = squot32(cgs + wave_size - 1, wave_size);
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip * wave_size;
        if (slt32(local_tid + offset, D_68526) &&
            (lane == 0 && (wave_id & (2 * skip - 1)) == 0)) {
            const double other = prod[local_tid + offset];
            if (active) {
                acc = acc + other;
            }
            prod[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    const double dot = prod[0];

    double neg_dot = 0.0;
    bool below_diag = false;   // gtid_d <  ltid
    bool on_diag = false;      // gtid_d == ltid
    if (active) {
        neg_dot = 0.0 - dot;
        below_diag = slt32(gtid_d, ltid);
        on_diag = gtid_d == ltid;
    }

    // ---- Step 2: build the per-thread vectors ----------------------------
    // Scratch element i of this thread lives at base + i * cgs.
    const int32_t base_D = group_id * (cgs * D_68526) + local_tid;
    const int32_t base_T = group_id * (cgs * triD_68516) + local_tid;

    for (int32_t r = 0; r < rounds_D; r++) {
        const int32_t ctid = r * cgs + local_tid;
        if (slt32(ctid, D_68526)) {
            const double scaled = x_81026 *
                in_80889[gtid_n * (D_68526 * K_68510) + gtid_k * D_68526 +
                         ltid];

            if (below_diag) {
                // Both vectors are all zero.
                mig_73444_zero_strided(scr_81032, base_D, cgs, D_68526);
                mig_73444_zero_strided(scr_81035, base_T, cgs, triD_68516);
                mig_73444_copy_strided(scr_81782, scr_81032, base_D, cgs,
                                       D_68526);
                mig_73444_copy_strided(scr_81779, scr_81035, base_T, cgs,
                                       triD_68516);
            } else {
                if (on_diag) {
                    // Length-D vector has one entry at position gtid_d.
                    const double hot_val = scaled * futrts_exp64(qs_elem);
                    mig_73444_one_hot_strided(scr_81038, base_D, cgs, D_68526,
                                              gtid_d, hot_val);
                    mig_73444_zero_strided(scr_81041, base_T, cgs,
                                           triD_68516);
                    mig_73444_copy_strided(scr_81772, scr_81038, base_D, cgs,
                                           D_68526);
                    mig_73444_copy_strided(scr_81769, scr_81041, base_T, cgs,
                                           triD_68516);
                } else {
                    // Length-triD vector has one entry at position
                    // ltid + gtid_d * (gtid_d - 1) / 2.
                    const int32_t hot = ltid +
                        sdiv32(gtid_d * (gtid_d - 1), 2);
                    mig_73444_one_hot_strided(scr_81044, base_T, cgs,
                                              triD_68516, hot, scaled);
                    mig_73444_zero_strided(scr_81047, base_D, cgs, D_68526);
                    mig_73444_copy_strided(scr_81772, scr_81047, base_D, cgs,
                                           D_68526);
                    mig_73444_copy_strided(scr_81769, scr_81044, base_T, cgs,
                                           triD_68516);
                }
                mig_73444_copy_strided(scr_81782, scr_81772, base_D, cgs,
                                       D_68526);
                mig_73444_copy_strided(scr_81779, scr_81769, base_T, cgs,
                                       triD_68516);
            }

            // Stage the two vectors as row `ctid` of the local arrays.
            for (int32_t i = 0; i < D_68526; i++) {
                rows_D[ctid * D_68526 + i] = scr_81782[base_D + i * cgs];
            }
            for (int32_t i = 0; i < triD_68516; i++) {
                rows_T[ctid * triD_68516 + i] = scr_81779[base_T + i * cgs];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // ---- Step 3: sum D rows at column ltid -------------------------------
    // NOTE(review): the column is selected by ltid while the result slot is
    // the combine index ctid; these agree only if the loop runs once and
    // ltid equals the local id -- confirm against the generator.
    for (int32_t r = 0; r < rounds_D; r++) {
        const int32_t ctid = r * cgs + local_tid;
        if (slt32(ctid, D_68526)) {
            double sum = 0.0;
            for (int32_t i = 0; i < D_68526; i++) {
                sum = sum + rows_D[ltid + D_68526 * i];
            }
            sums_D[ctid] = sum;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Same NOTE(review) as above applies to the ltid column index here.
    for (int32_t r = 0; r < rounds_T; r++) {
        const int32_t ctid = r * cgs + local_tid;
        if (slt32(ctid, triD_68516)) {
            double sum = 0.0;
            for (int32_t i = 0; i < D_68526; i++) {
                sum = sum + rows_T[ltid + triD_68516 * i];
            }
            sums_T[ctid] = sum;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // ---- Step 4: write the group's results -------------------------------
    const int32_t out_rounds_T = squot32(triD_68516 - local_tid + cgs - 1,
                                         cgs);
    for (int32_t i = 0; i < out_rounds_T; i++) {
        const int32_t j = i * cgs + local_tid;
        out_81069[group_id * triD_68516 + j] = sums_T[j];
    }
    const int32_t out_rounds_D = squot32(D_68526 - local_tid + cgs - 1, cgs);
    for (int32_t i = 0; i < out_rounds_D; i++) {
        const int32_t j = i * cgs + local_tid;
        out_81073[group_id * D_68526 + j] = sums_D[j];
    }
    if (local_tid == 0) {
        out_81076[group_id] = neg_dot;
    }
}
/*
 * map_intra_group_73690
 *
 * Below, D = D_68526 and triD = triD_68516.
 *
 * Each work-group add-reduces D consecutive doubles of mem_81250 and local
 * thread 0 stores the sum to mem_81256[group_id].
 *
 * The global id is split into (gtid_n, gtid_k, gtid_d, gtid_t, ltid) with
 * extents (N_68508, K_68510, D, triD, D); mem_81250 is read at the
 * corresponding row-major position.  A thread is "active" when all five
 * indices are in range.
 *
 * mem_81253_backing_aligned_0 is local scratch that is accessed at double
 * indices below D.
 *
 * Differences from the generated original:
 *   - The loaded value is initialised, and an inactive thread keeps its
 *     accumulator instead of storing an indeterminate value.
 *   - Buffers are indexed through typed double pointers, so no 32-bit
 *     `index * 8` byte offset is formed.
 *   - The int64_t backing pointer is cast explicitly.
 */
__kernel void map_intra_group_73690(__local volatile
                                    int64_t *mem_81253_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t triD_68516, int32_t D_68526,
                                    __global unsigned char *mem_81250,
                                    __global unsigned char *mem_81256)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;

    __local double *red = (__local double *) mem_81253_backing_aligned_0;
    volatile __local double *red_v =
        (volatile __local double *) mem_81253_backing_aligned_0;
    __global double *src = (__global double *) mem_81250;
    __global double *dst = (__global double *) mem_81256;

    // Split the flat global id by successive quotient/remainder.
    const int32_t span_n = K_68510 * D_68526 * triD_68516 * D_68526;
    const int32_t span_k = D_68526 * triD_68516 * D_68526;
    const int32_t span_d = triD_68516 * D_68526;
    const int32_t gtid_n = squot32(global_tid, span_n);
    const int32_t rem_n = global_tid - gtid_n * span_n;
    const int32_t gtid_k = squot32(rem_n, span_k);
    const int32_t rem_k = rem_n - gtid_k * span_k;
    const int32_t gtid_d = squot32(rem_k, span_d);
    const int32_t rem_d = rem_k - gtid_d * span_d;
    const int32_t gtid_t = squot32(rem_d, D_68526);
    const int32_t ltid = rem_d - gtid_t * D_68526;

    const bool active = (((slt32(gtid_n, N_68508) &&
                           slt32(gtid_k, K_68510)) &&
                          slt32(gtid_d, D_68526)) &&
                         slt32(gtid_t, triD_68516)) && slt32(ltid, D_68526);

    double elem = 0.0;
    if (active) {
        elem = src[gtid_n * (D_68526 * triD_68516 * D_68526 * K_68510) +
                   gtid_k * (D_68526 * triD_68516 * D_68526) +
                   gtid_d * (D_68526 * triD_68516) +
                   gtid_t * D_68526 + ltid];
    }

    // Stage one element per thread in local memory.
    if (slt32(local_tid, D_68526)) {
        red[local_tid] = elem;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // participating threads read initial accumulator
    double acc = 0.0;
    if (slt32(local_tid, D_68526)) {
        acc = red[local_tid];
    }

    const int32_t wave_id = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave_id * wave_size;

    // Phase A: strides below LOCKSTEP_WIDTH.  No barrier is issued here; the
    // accesses are volatile, as in the original (presumably relying on
    // lockstep execution within LOCKSTEP_WIDTH consecutive threads).
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, D_68526) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = red_v[local_tid + offset];
            if (active) {
                acc = acc + other;
            }
            red_v[local_tid] = acc;
        }
    }

    // Phase B: strides that are multiples of LOCKSTEP_WIDTH, with a barrier
    // before each step.
    const int32_t num_waves = squot32(D_68526 + wave_size - 1, wave_size);
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip * wave_size;
        if (slt32(local_tid + offset, D_68526) &&
            (lane == 0 && (wave_id & (2 * skip - 1)) == 0)) {
            const double other = red[local_tid + offset];
            if (active) {
                acc = acc + other;
            }
            red[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    const double total = red[0];
    if (local_tid == 0) {
        dst[group_id] = total;
    }
}
/*
 * map_intra_group_73863
 *
 * Below, D = D_68526.
 *
 * Each work-group add-reduces D consecutive doubles of mem_81181 and local
 * thread 0 stores the sum to mem_81187[group_id].
 *
 * The global id is split into (gtid_n, gtid_k, gtid_a, gtid_b, ltid) with
 * extents (N_68508, K_68510, D, D, D); mem_81181 is read at the corresponding
 * row-major position.  A thread is "active" when all five indices are in
 * range.
 *
 * mem_81184_backing_aligned_0 is local scratch that is accessed at double
 * indices below D.
 *
 * Differences from the generated original:
 *   - The loaded value is initialised, and an inactive thread keeps its
 *     accumulator instead of storing an indeterminate value.
 *   - Buffers are indexed through typed double pointers, so no 32-bit
 *     `index * 8` byte offset is formed.
 *   - The int64_t backing pointer is cast explicitly.
 */
__kernel void map_intra_group_73863(__local volatile
                                    int64_t *mem_81184_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t D_68526,
                                    __global unsigned char *mem_81181,
                                    __global unsigned char *mem_81187)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;

    __local double *red = (__local double *) mem_81184_backing_aligned_0;
    volatile __local double *red_v =
        (volatile __local double *) mem_81184_backing_aligned_0;
    __global double *src = (__global double *) mem_81181;
    __global double *dst = (__global double *) mem_81187;

    // Split the flat global id by successive quotient/remainder.
    const int32_t span_n = K_68510 * D_68526 * D_68526 * D_68526;
    const int32_t span_k = D_68526 * D_68526 * D_68526;
    const int32_t span_a = D_68526 * D_68526;
    const int32_t gtid_n = squot32(global_tid, span_n);
    const int32_t rem_n = global_tid - gtid_n * span_n;
    const int32_t gtid_k = squot32(rem_n, span_k);
    const int32_t rem_k = rem_n - gtid_k * span_k;
    const int32_t gtid_a = squot32(rem_k, span_a);
    const int32_t rem_a = rem_k - gtid_a * span_a;
    const int32_t gtid_b = squot32(rem_a, D_68526);
    const int32_t ltid = rem_a - gtid_b * D_68526;

    const bool active = (((slt32(gtid_n, N_68508) &&
                           slt32(gtid_k, K_68510)) &&
                          slt32(gtid_a, D_68526)) &&
                         slt32(gtid_b, D_68526)) && slt32(ltid, D_68526);

    double elem = 0.0;
    if (active) {
        elem = src[gtid_n * (D_68526 * D_68526 * D_68526 * K_68510) +
                   gtid_k * (D_68526 * D_68526 * D_68526) +
                   gtid_a * (D_68526 * D_68526) +
                   gtid_b * D_68526 + ltid];
    }

    // Stage one element per thread in local memory.
    if (slt32(local_tid, D_68526)) {
        red[local_tid] = elem;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // participating threads read initial accumulator
    double acc = 0.0;
    if (slt32(local_tid, D_68526)) {
        acc = red[local_tid];
    }

    const int32_t wave_id = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave_id * wave_size;

    // Phase A: strides below LOCKSTEP_WIDTH.  No barrier is issued here; the
    // accesses are volatile, as in the original (presumably relying on
    // lockstep execution within LOCKSTEP_WIDTH consecutive threads).
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, D_68526) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = red_v[local_tid + offset];
            if (active) {
                acc = acc + other;
            }
            red_v[local_tid] = acc;
        }
    }

    // Phase B: strides that are multiples of LOCKSTEP_WIDTH, with a barrier
    // before each step.
    const int32_t num_waves = squot32(D_68526 + wave_size - 1, wave_size);
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip * wave_size;
        if (slt32(local_tid + offset, D_68526) &&
            (lane == 0 && (wave_id & (2 * skip - 1)) == 0)) {
            const double other = red[local_tid + offset];
            if (active) {
                acc = acc + other;
            }
            red[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    const double total = red[0];
    if (local_tid == 0) {
        dst[group_id] = total;
    }
}
// map_intra_group_74690
//
// Splits the flat global id into (gtid_74681, gtid_74682, gtid_74683,
// ltid_74686), bounded by (N, K, D, D).  Each in-range work-item forms the
// product of one double from mem_80889 and one from mem_80879; the products
// are summed across the work-group through local memory, and local thread 0
// stores r + r, with r = mem_80893[gtid_74681, gtid_74682] * sum, into
// mem_80914[group id].
//
// Parameters:
//   mem_80911_backing_aligned_0  local scratch, accessed as doubles at
//                                indices below D (assumes the host provides
//                                at least D * 8 bytes -- TODO confirm).
//   N_68508, K_68510, D_68526    extents used for indexing and bounds checks.
//   mem_80879, mem_80889,
//   mem_80893                    input buffers, read as doubles.
//   mem_80914                    output buffer, one double per work-group.
//
// Every value that is only assigned for in-range work-items is initialised
// to 0.0, and out-of-range work-items leave their accumulator untouched, so
// no indeterminate value can reach local or global memory.
__kernel void map_intra_group_74690(__local volatile
                                    int64_t *mem_80911_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t D_68526, __global
                                    unsigned char *mem_80879, __global
                                    unsigned char *mem_80889, __global
                                    unsigned char *mem_80893, __global
                                    unsigned char *mem_80914)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    __local char *mem_80911 = (__local char *) mem_80911_backing_aligned_0;

    // Peel the four indices off the flat id, outermost first.
    const int32_t gtid_74681 = squot32(global_tid,
                                       K_68510 * D_68526 * D_68526);
    const int32_t rem_outer = global_tid -
        gtid_74681 * (K_68510 * D_68526 * D_68526);
    const int32_t gtid_74682 = squot32(rem_outer, D_68526 * D_68526);
    const int32_t rem_mid = rem_outer - gtid_74682 * (D_68526 * D_68526);
    const int32_t gtid_74683 = squot32(rem_mid, D_68526);
    const int32_t ltid_74686 = rem_mid - gtid_74683 * D_68526;

    const int active = ((slt32(gtid_74681, N_68508) &&
                         slt32(gtid_74682, K_68510)) &&
                        slt32(gtid_74683, D_68526)) &&
        slt32(ltid_74686, D_68526);

    double rev_sqnorm_arg = 0.0;
    double product = 0.0;

    if (active) {
        rev_sqnorm_arg =
            *(__global double *) &mem_80893[(gtid_74681 * K_68510 +
                                             gtid_74682) * 8];

        const double lhs =
            *(__global double *) &mem_80889[(gtid_74681 *
                                             (D_68526 * K_68510) +
                                             gtid_74682 * D_68526 +
                                             ltid_74686) * 8];
        const double rhs =
            *(__global double *) &mem_80879[(gtid_74681 *
                                             (D_68526 * D_68526 * K_68510) +
                                             gtid_74682 *
                                             (D_68526 * D_68526) +
                                             gtid_74683 * D_68526 +
                                             ltid_74686) * 8];

        product = lhs * rhs;
    }

    // Publish this work-item's product to local memory.
    if (slt32(local_tid, D_68526)) {
        *(__local double *) &mem_80911[local_tid * 8] = product;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating threads read their initial accumulator.
    double acc = 0.0;

    if (slt32(local_tid, D_68526)) {
        acc = *(__local double *) &mem_80911[local_tid * 8];
    }

    const int32_t lane = local_tid - squot32(local_tid, wave_size) *
        wave_size;
    const int32_t wave_id = squot32(local_tid, wave_size);

    // Tree reduction inside one wave.  There is no barrier in this loop; it
    // uses volatile local accesses and presumably relies on LOCKSTEP_WIDTH
    // work-items executing in lockstep -- confirm for the target device.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, D_68526) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other =
                *(volatile __local double *) &mem_80911[(local_tid +
                                                         offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(volatile __local double *) &mem_80911[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves; only lane 0 of a wave takes part and
    // every step is preceded by a work-group barrier.
    const int32_t num_waves = squot32(D_68526 + wave_size - 1, wave_size);

    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t offset = skip_waves * wave_size;

        if (slt32(local_tid + offset, D_68526) &&
            (lane == 0 && (wave_id & (2 * skip_waves - 1)) == 0)) {
            const double other =
                *(__local double *) &mem_80911[(local_tid + offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(__local double *) &mem_80911[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the reduced value for the whole work-group.
    const double total = *(__local double *) &mem_80911[0];
    double result = 0.0;

    if (active) {
        const double scaled = rev_sqnorm_arg * total;

        result = scaled + scaled;
    }
    if (local_tid == 0) {
        *(__global double *) &mem_80914[group_id * 8] = result;
    }
}
// map_intra_group_78025
//
// Splits the flat global id into (gtid_78012, ltid_78013), bounded by
// (K, D).  Each in-range work-item sums N doubles of res_mem_81412 (stride
// D * K between consecutive terms) and places the sum in local memory.
// Local thread 0 stores exp(alphas_mem_80367[gtid_78012]) into
// mem_81460[group id]; afterwards the local sums are copied to row
// `group id` of mem_81464.
//
// Parameters:
//   mem_81457_backing_aligned_0  local scratch, accessed as doubles at
//                                indices below D (assumes at least D * 8
//                                bytes are provided -- TODO confirm).
//   N_68508, K_68510, D_68526    extents used for indexing and bounds checks.
//   alphas_mem_80367             input, read as doubles, indexed by
//                                gtid_78012.
//   res_mem_81412                input, read as doubles.
//   mem_81460                    output, one double per work-group.
//   mem_81464                    output, D doubles per work-group.
//
// Values that are only assigned for in-range work-items start at 0.0 so
// that the stores below never write an indeterminate value.
__kernel void map_intra_group_78025(__local volatile
                                    int64_t *mem_81457_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t D_68526, __global
                                    unsigned char *alphas_mem_80367, __global
                                    unsigned char *res_mem_81412, __global
                                    unsigned char *mem_81460, __global
                                    unsigned char *mem_81464)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    __local char *mem_81457 = (__local char *) mem_81457_backing_aligned_0;

    const int32_t gtid_78012 = squot32(global_tid, D_68526);
    const int32_t ltid_78013 = global_tid - gtid_78012 * D_68526;
    const int active = slt32(gtid_78012, K_68510) &&
        slt32(ltid_78013, D_68526);

    double alphas_elem = 0.0;
    double column_sum = 0.0;

    if (active) {
        alphas_elem = *(__global double *) &alphas_mem_80367[gtid_78012 * 8];

        // Sequential sum over the N-extent for this (gtid, ltid) pair.
        for (int32_t i = 0; i < N_68508; i++) {
            const double term =
                *(__global double *) &res_mem_81412[(i *
                                                     (D_68526 * K_68510) +
                                                     gtid_78012 * D_68526 +
                                                     ltid_78013) * 8];

            column_sum = column_sum + term;
        }
    }

    // Publish the per-work-item sum to local memory.
    if (slt32(local_tid, D_68526)) {
        *(__local double *) &mem_81457[local_tid * 8] = column_sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    double exp_alpha = 0.0;

    if (active) {
        exp_alpha = futrts_exp64(alphas_elem);
    }
    if (local_tid == 0) {
        *(__global double *) &mem_81460[group_id * 8] = exp_alpha;
    }

    // Copy the local array out to global memory, D elements per pass.
    const int32_t num_passes = squot32(D_68526 - local_tid + D_68526 - 1,
                                       D_68526);

    for (int32_t i = 0; i < num_passes; i++) {
        const int32_t elem = i * D_68526 + local_tid;

        *(__global double *) &mem_81464[(group_id * D_68526 + elem) * 8] =
            *(__local double *) &mem_81457[elem * 8];
    }
}
// map_intra_group_78182
//
// Splits the flat global id into (gtid_78175, gtid_78176, ltid_78178),
// bounded by (K, D, N).  Each in-range work-item loads one double from
// mem_81473; the work-group sums those values through local memory and
// local thread 0 stores the sum into mem_81479[group id].
//
// Parameters:
//   mem_81476_backing_aligned_0  local scratch, accessed as doubles at
//                                indices below N (assumes at least N * 8
//                                bytes are provided -- TODO confirm).
//   N_68508, K_68510, D_68526    extents used for indexing and bounds checks.
//   mem_81473                    input buffer, read as doubles.
//   mem_81479                    output buffer, one double per work-group.
//
// The loaded value starts at 0.0 and out-of-range work-items leave their
// accumulator untouched, so no indeterminate value is ever stored.
__kernel void map_intra_group_78182(__local volatile
                                    int64_t *mem_81476_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t D_68526, __global
                                    unsigned char *mem_81473, __global
                                    unsigned char *mem_81479)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    __local char *mem_81476 = (__local char *) mem_81476_backing_aligned_0;

    // Peel the three indices off the flat id, outermost first.
    const int32_t gtid_78175 = squot32(global_tid, D_68526 * N_68508);
    const int32_t rem_outer = global_tid -
        gtid_78175 * (D_68526 * N_68508);
    const int32_t gtid_78176 = squot32(rem_outer, N_68508);
    const int32_t ltid_78178 = rem_outer - gtid_78176 * N_68508;

    const int active = (slt32(gtid_78175, K_68510) &&
                        slt32(gtid_78176, D_68526)) &&
        slt32(ltid_78178, N_68508);

    double loaded = 0.0;

    if (active) {
        loaded = *(__global double *) &mem_81473[(gtid_78175 *
                                                  (N_68508 * D_68526) +
                                                  gtid_78176 * N_68508 +
                                                  ltid_78178) * 8];
    }

    // Publish this work-item's value to local memory.
    if (slt32(local_tid, N_68508)) {
        *(__local double *) &mem_81476[local_tid * 8] = loaded;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating threads read their initial accumulator.
    double acc = 0.0;

    if (slt32(local_tid, N_68508)) {
        acc = *(__local double *) &mem_81476[local_tid * 8];
    }

    const int32_t lane = local_tid - squot32(local_tid, wave_size) *
        wave_size;
    const int32_t wave_id = squot32(local_tid, wave_size);

    // Tree reduction inside one wave.  There is no barrier in this loop; it
    // uses volatile local accesses and presumably relies on LOCKSTEP_WIDTH
    // work-items executing in lockstep -- confirm for the target device.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, N_68508) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other =
                *(volatile __local double *) &mem_81476[(local_tid +
                                                         offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(volatile __local double *) &mem_81476[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves; only lane 0 of a wave takes part and
    // every step is preceded by a work-group barrier.
    const int32_t num_waves = squot32(N_68508 + wave_size - 1, wave_size);

    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t offset = skip_waves * wave_size;

        if (slt32(local_tid + offset, N_68508) &&
            (lane == 0 && (wave_id & (2 * skip_waves - 1)) == 0)) {
            const double other =
                *(__local double *) &mem_81476[(local_tid + offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(__local double *) &mem_81476[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the reduced value for the whole work-group.
    const double total = *(__local double *) &mem_81476[0];

    if (local_tid == 0) {
        *(__global double *) &mem_81479[group_id * 8] = total;
    }
}
// map_intra_group_78574
//
// Splits the flat global id into (gtid_78545, ltid_78546) using
// computed_group_sizze_78572 as the inner extent, then performs four steps:
//
//   1. Stages doubles from res_mem_81411 into local array mem_81549, sums
//      them across the work-group, and forms
//      res_68862 * exp(alphas_mem_80367[gtid_78545]) + sum.
//   2. Fills local array mem_81552: each slot gets an N-term sum over
//      res_mem_81413 plus res_68867 + e * (t + t), where
//      e = exp(qs_mem_80369[...]) and t = t1389_68865 * e.
//   3. Fills local array mem_81555: each slot gets an N-term sum over
//      res_mem_81414 plus (u + u), where u = t1389_68865 * icf_mem_80370[...].
//   4. Copies mem_81555 to row `group id` of mem_81559, mem_81552 to row
//      `group id` of mem_81563, and has local thread 0 store the step-1
//      value into mem_81566[group id].
//
// Parameters:
//   mem_81549_backing_aligned_0  local scratch for step 1 (indices below N).
//   mem_81552_backing_aligned_1  local scratch for step 2 (indices below D).
//   mem_81555_backing_aligned_2  local scratch for step 3 (indices below
//                                triD).
//   N_68508, K_68510, D_68514,
//   triD_68516, D_68526          extents used for indexing and bounds checks.
//   res_68862, t1389_68865,
//   res_68867                    scalar coefficients.
//   computed_group_sizze_78572   inner extent of the id split and stride of
//                                the staging loops (presumably equal to the
//                                launch work-group size -- TODO confirm).
//   alphas_mem_80367, qs_mem_80369, icf_mem_80370,
//   res_mem_81411, res_mem_81413, res_mem_81414
//                                input buffers, read as doubles.
//   mem_81559, mem_81563,
//   mem_81566                    output buffers.
//
// NOTE(review): the global loads in all three staging loops are indexed by
// ltid_78546 rather than by the staging slot `ctid`; this is kept as
// generated.  The loads stay inside the `ctid` bounds checks so that no
// additional global reads are issued.
//
// Values that are only assigned for in-range work-items start at 0.0, and
// out-of-range work-items leave their accumulator untouched, so no
// indeterminate value is ever stored.
__kernel void map_intra_group_78574(__local volatile
                                    int64_t *mem_81549_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_81552_backing_aligned_1,
                                    __local volatile
                                    int64_t *mem_81555_backing_aligned_2,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t D_68514, int32_t triD_68516,
                                    int32_t D_68526, double res_68862,
                                    double t1389_68865, double res_68867,
                                    int32_t computed_group_sizze_78572, __global
                                    unsigned char *alphas_mem_80367, __global
                                    unsigned char *qs_mem_80369, __global
                                    unsigned char *icf_mem_80370, __global
                                    unsigned char *res_mem_81411, __global
                                    unsigned char *res_mem_81413, __global
                                    unsigned char *res_mem_81414, __global
                                    unsigned char *mem_81559, __global
                                    unsigned char *mem_81563, __global
                                    unsigned char *mem_81566)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    const int32_t group_extent = computed_group_sizze_78572;
    __local char *mem_81549 = (__local char *) mem_81549_backing_aligned_0;
    __local char *mem_81552 = (__local char *) mem_81552_backing_aligned_1;
    __local char *mem_81555 = (__local char *) mem_81555_backing_aligned_2;

    const int32_t gtid_78545 = squot32(global_tid, group_extent);
    const int32_t ltid_78546 = global_tid - gtid_78545 * group_extent;
    const int active = slt32(gtid_78545, K_68510) &&
        slt32(ltid_78546, group_extent);

    double alphas_elem = 0.0;

    if (active) {
        alphas_elem = *(__global double *) &alphas_mem_80367[gtid_78545 * 8];
    }

    // ---- Step 1: stage res_mem_81411 values and reduce them. ----
    const int32_t stage_n_passes = squot32(N_68508 + group_extent - 1,
                                           group_extent);

    for (int32_t pass = 0; pass < stage_n_passes; pass++) {
        const int32_t ctid = pass * group_extent + local_tid;

        if (slt32(ctid, N_68508)) {
            const double staged =
                *(__global double *) &res_mem_81411[(ltid_78546 * K_68510 +
                                                     gtid_78545) * 8];

            *(__local double *) &mem_81549[ctid * 8] = staged;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating threads read their initial accumulator.
    double acc = 0.0;

    if (slt32(local_tid, N_68508)) {
        acc = *(__local double *) &mem_81549[local_tid * 8];
    }

    const int32_t lane = local_tid - squot32(local_tid, wave_size) *
        wave_size;
    const int32_t wave_id = squot32(local_tid, wave_size);

    // Tree reduction inside one wave.  There is no barrier in this loop; it
    // uses volatile local accesses and presumably relies on LOCKSTEP_WIDTH
    // work-items executing in lockstep -- confirm for the target device.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, N_68508) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other =
                *(volatile __local double *) &mem_81549[(local_tid +
                                                         offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(volatile __local double *) &mem_81549[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves.  The wave count is derived from
    // computed_group_sizze_78572 while the element bound is N.
    const int32_t num_waves = squot32(group_extent + wave_size - 1,
                                      wave_size);

    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t offset = skip_waves * wave_size;

        if (slt32(local_tid + offset, N_68508) &&
            (lane == 0 && (wave_id & (2 * skip_waves - 1)) == 0)) {
            const double other =
                *(__local double *) &mem_81549[(local_tid + offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(__local double *) &mem_81549[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the reduced value for the whole work-group.
    const double total = *(__local double *) &mem_81549[0];
    double alpha_result = 0.0;

    if (active) {
        const double exp_alpha = futrts_exp64(alphas_elem);
        const double weighted = res_68862 * exp_alpha;

        alpha_result = total + weighted;
    }

    // ---- Step 2: fill mem_81552 (indices below D). ----
    const int32_t stage_d_passes = squot32(D_68526 + group_extent - 1,
                                           group_extent);

    for (int32_t pass = 0; pass < stage_d_passes; pass++) {
        const int32_t ctid = pass * group_extent + local_tid;

        if (slt32(ctid, D_68526)) {
            const double qs_elem =
                *(__global double *) &qs_mem_80369[(gtid_78545 * D_68514 +
                                                    ltid_78546) * 8];
            double sum = 0.0;

            // Sequential sum over the N-extent; consecutive terms are
            // D * K doubles apart.
            for (int32_t i = 0; i < N_68508; i++) {
                const double term =
                    *(__global double *) &res_mem_81413[(gtid_78545 *
                                                         D_68526 +
                                                         ltid_78546 +
                                                         D_68526 * K_68510 *
                                                         i) * 8];

                sum = sum + term;
            }

            const double exp_q = futrts_exp64(qs_elem);
            const double scaled = t1389_68865 * exp_q;
            const double doubled = scaled + scaled;
            const double quad = exp_q * doubled;
            const double shifted = res_68867 + quad;

            *(__local double *) &mem_81552[ctid * 8] = sum + shifted;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // ---- Step 3: fill mem_81555 (indices below triD). ----
    const int32_t stage_t_passes = squot32(triD_68516 + group_extent - 1,
                                           group_extent);

    for (int32_t pass = 0; pass < stage_t_passes; pass++) {
        const int32_t ctid = pass * group_extent + local_tid;

        if (slt32(ctid, triD_68516)) {
            const double icf_elem =
                *(__global double *) &icf_mem_80370[(gtid_78545 *
                                                     triD_68516 +
                                                     ltid_78546) * 8];
            double sum = 0.0;

            // Sequential sum over the N-extent; consecutive terms are
            // triD * K doubles apart.
            for (int32_t i = 0; i < N_68508; i++) {
                const double term =
                    *(__global double *) &res_mem_81414[(gtid_78545 *
                                                         triD_68516 +
                                                         ltid_78546 +
                                                         triD_68516 *
                                                         K_68510 * i) * 8];

                sum = sum + term;
            }

            const double scaled = t1389_68865 * icf_elem;
            const double doubled = scaled + scaled;

            *(__local double *) &mem_81555[ctid * 8] = sum + doubled;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // ---- Step 4: copy the local arrays out and store the scalar. ----
    const int32_t copy_t_passes = squot32(triD_68516 - local_tid +
                                          group_extent - 1, group_extent);

    for (int32_t i = 0; i < copy_t_passes; i++) {
        const int32_t elem = i * group_extent + local_tid;

        *(__global double *) &mem_81559[(group_id * triD_68516 + elem) * 8] =
            *(__local double *) &mem_81555[elem * 8];
    }

    const int32_t copy_d_passes = squot32(D_68526 - local_tid +
                                          group_extent - 1, group_extent);

    for (int32_t i = 0; i < copy_d_passes; i++) {
        const int32_t elem = i * group_extent + local_tid;

        *(__global double *) &mem_81563[(group_id * D_68526 + elem) * 8] =
            *(__local double *) &mem_81552[elem * 8];
    }

    if (local_tid == 0) {
        *(__global double *) &mem_81566[group_id * 8] = alpha_result;
    }
}
// map_intra_group_78874
//
// Splits the flat global id into (gtid_78867, gtid_78868, ltid_78870),
// bounded by (K, triD, N).  Each in-range work-item loads one double from
// mem_81623; the work-group sums those values through local memory and
// local thread 0 stores sum + (u + u) into mem_81629[group id], where
// u = t1389_68865 * mem_81618[gtid_78868 * K_68515 + gtid_78867].
//
// Parameters:
//   mem_81626_backing_aligned_0  local scratch, accessed as doubles at
//                                indices below N (assumes at least N * 8
//                                bytes are provided -- TODO confirm).
//   N_68508, K_68510, triD_68516 extents used for indexing and bounds checks.
//   K_68515                      row stride (in doubles) of mem_81618.
//   t1389_68865                  scalar coefficient.
//   mem_81618, mem_81623         input buffers, read as doubles.
//   mem_81629                    output buffer, one double per work-group.
//
// Values that are only assigned for in-range work-items start at 0.0, and
// out-of-range work-items leave their accumulator untouched, so no
// indeterminate value is ever stored.
__kernel void map_intra_group_78874(__local volatile
                                    int64_t *mem_81626_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t K_68515, int32_t triD_68516,
                                    double t1389_68865, __global
                                    unsigned char *mem_81618, __global
                                    unsigned char *mem_81623, __global
                                    unsigned char *mem_81629)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    __local char *mem_81626 = (__local char *) mem_81626_backing_aligned_0;

    // Peel the three indices off the flat id, outermost first.
    const int32_t gtid_78867 = squot32(global_tid, triD_68516 * N_68508);
    const int32_t rem_outer = global_tid -
        gtid_78867 * (triD_68516 * N_68508);
    const int32_t gtid_78868 = squot32(rem_outer, N_68508);
    const int32_t ltid_78870 = rem_outer - gtid_78868 * N_68508;

    const int active = (slt32(gtid_78867, K_68510) &&
                        slt32(gtid_78868, triD_68516)) &&
        slt32(ltid_78870, N_68508);

    double icf_elem = 0.0;
    double loaded = 0.0;

    if (active) {
        icf_elem = *(__global double *) &mem_81618[(gtid_78868 * K_68515 +
                                                    gtid_78867) * 8];
        loaded = *(__global double *) &mem_81623[(gtid_78867 *
                                                  (N_68508 * triD_68516) +
                                                  gtid_78868 * N_68508 +
                                                  ltid_78870) * 8];
    }

    // Publish this work-item's value to local memory.
    if (slt32(local_tid, N_68508)) {
        *(__local double *) &mem_81626[local_tid * 8] = loaded;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating threads read their initial accumulator.
    double acc = 0.0;

    if (slt32(local_tid, N_68508)) {
        acc = *(__local double *) &mem_81626[local_tid * 8];
    }

    const int32_t lane = local_tid - squot32(local_tid, wave_size) *
        wave_size;
    const int32_t wave_id = squot32(local_tid, wave_size);

    // Tree reduction inside one wave.  There is no barrier in this loop; it
    // uses volatile local accesses and presumably relies on LOCKSTEP_WIDTH
    // work-items executing in lockstep -- confirm for the target device.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, N_68508) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other =
                *(volatile __local double *) &mem_81626[(local_tid +
                                                         offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(volatile __local double *) &mem_81626[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves; only lane 0 of a wave takes part and
    // every step is preceded by a work-group barrier.
    const int32_t num_waves = squot32(N_68508 + wave_size - 1, wave_size);

    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t offset = skip_waves * wave_size;

        if (slt32(local_tid + offset, N_68508) &&
            (lane == 0 && (wave_id & (2 * skip_waves - 1)) == 0)) {
            const double other =
                *(__local double *) &mem_81626[(local_tid + offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(__local double *) &mem_81626[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the reduced value for the whole work-group.
    const double total = *(__local double *) &mem_81626[0];
    double result = 0.0;

    if (active) {
        const double scaled = t1389_68865 * icf_elem;
        const double doubled = scaled + scaled;

        result = total + doubled;
    }
    if (local_tid == 0) {
        *(__global double *) &mem_81629[group_id * 8] = result;
    }
}
// map_intra_group_79031
//
// Splits the flat global id into (gtid_79024, gtid_79025, ltid_79027),
// bounded by (K, D, N).  Each in-range work-item loads one double from
// mem_81589; the work-group sums those values through local memory and
// local thread 0 stores sum + (res_68867 + e * (t + t)) into
// mem_81595[group id], where
// e = exp(mem_81584[gtid_79025 * K_68513 + gtid_79024]) and
// t = t1389_68865 * e.
//
// Parameters:
//   mem_81592_backing_aligned_0  local scratch, accessed as doubles at
//                                indices below N (assumes at least N * 8
//                                bytes are provided -- TODO confirm).
//   N_68508, K_68510, D_68526    extents used for indexing and bounds checks.
//   K_68513                      row stride (in doubles) of mem_81584.
//   t1389_68865, res_68867       scalar coefficients.
//   mem_81584, mem_81589         input buffers, read as doubles.
//   mem_81595                    output buffer, one double per work-group.
//
// Values that are only assigned for in-range work-items start at 0.0, and
// out-of-range work-items leave their accumulator untouched, so no
// indeterminate value is ever stored.
__kernel void map_intra_group_79031(__local volatile
                                    int64_t *mem_81592_backing_aligned_0,
                                    int32_t N_68508, int32_t K_68510,
                                    int32_t K_68513, int32_t D_68526,
                                    double t1389_68865, double res_68867,
                                    __global unsigned char *mem_81584, __global
                                    unsigned char *mem_81589, __global
                                    unsigned char *mem_81595)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    __local char *mem_81592 = (__local char *) mem_81592_backing_aligned_0;

    // Peel the three indices off the flat id, outermost first.
    const int32_t gtid_79024 = squot32(global_tid, D_68526 * N_68508);
    const int32_t rem_outer = global_tid -
        gtid_79024 * (D_68526 * N_68508);
    const int32_t gtid_79025 = squot32(rem_outer, N_68508);
    const int32_t ltid_79027 = rem_outer - gtid_79025 * N_68508;

    const int active = (slt32(gtid_79024, K_68510) &&
                        slt32(gtid_79025, D_68526)) &&
        slt32(ltid_79027, N_68508);

    double qs_elem = 0.0;
    double loaded = 0.0;

    if (active) {
        qs_elem = *(__global double *) &mem_81584[(gtid_79025 * K_68513 +
                                                   gtid_79024) * 8];
        loaded = *(__global double *) &mem_81589[(gtid_79024 *
                                                  (N_68508 * D_68526) +
                                                  gtid_79025 * N_68508 +
                                                  ltid_79027) * 8];
    }

    // Publish this work-item's value to local memory.
    if (slt32(local_tid, N_68508)) {
        *(__local double *) &mem_81592[local_tid * 8] = loaded;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Participating threads read their initial accumulator.
    double acc = 0.0;

    if (slt32(local_tid, N_68508)) {
        acc = *(__local double *) &mem_81592[local_tid * 8];
    }

    const int32_t lane = local_tid - squot32(local_tid, wave_size) *
        wave_size;
    const int32_t wave_id = squot32(local_tid, wave_size);

    // Tree reduction inside one wave.  There is no barrier in this loop; it
    // uses volatile local accesses and presumably relies on LOCKSTEP_WIDTH
    // work-items executing in lockstep -- confirm for the target device.
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, N_68508) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other =
                *(volatile __local double *) &mem_81592[(local_tid +
                                                         offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(volatile __local double *) &mem_81592[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves; only lane 0 of a wave takes part and
    // every step is preceded by a work-group barrier.
    const int32_t num_waves = squot32(N_68508 + wave_size - 1, wave_size);

    for (int32_t skip_waves = 1; slt32(skip_waves, num_waves);
         skip_waves *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t offset = skip_waves * wave_size;

        if (slt32(local_tid + offset, N_68508) &&
            (lane == 0 && (wave_id & (2 * skip_waves - 1)) == 0)) {
            const double other =
                *(__local double *) &mem_81592[(local_tid + offset) * 8];

            if (active) {
                acc = acc + other;
            }
            *(__local double *) &mem_81592[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Slot 0 now holds the reduced value for the whole work-group.
    const double total = *(__local double *) &mem_81592[0];
    double result = 0.0;

    if (active) {
        const double exp_q = futrts_exp64(qs_elem);
        const double scaled = t1389_68865 * exp_q;
        const double doubled = scaled + scaled;
        const double quad = exp_q * doubled;
        const double shifted = res_68867 + quad;

        result = total + shifted;
    }
    if (local_tid == 0) {
        *(__global double *) &mem_81595[group_id * 8] = result;
    }
}
// map_transpose_f64
//
// Transposes x_elems_5-by-y_elems_6 arrays of doubles from srcmem_2 into
// destmem_0 through a local-memory tile.  The array to work on is selected
// by the work-group id in dimension 2.  Each work-item moves up to four
// elements in each phase, spaced 8 rows apart; the tile is addressed with a
// row pitch of 33 doubles.
//
// Parameters:
//   destoffset_1, srcoffset_3  byte offsets into destmem_0 / srcmem_2
//                              (divided by 8 to obtain element offsets).
//   num_arrays_4               not read by this kernel.
//   x_elems_5, y_elems_6       source row length and source row count.
//   in_elems_7, out_elems_8    upper bounds on the per-array element index
//                              for reads and writes respectively.
//   mulx_9, muly_10            not read by this kernel.
//   destmem_0, srcmem_2        destination and source buffers.
__kernel void map_transpose_f64(int32_t destoffset_1, int32_t srcoffset_3,
                                int32_t num_arrays_4, int32_t x_elems_5,
                                int32_t y_elems_6, int32_t in_elems_7,
                                int32_t out_elems_8, int32_t mulx_9,
                                int32_t muly_10, __global
                                unsigned char *destmem_0, __global
                                unsigned char *srcmem_2)
{
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 8448);
    __local char *tile = (__local char *) block_11_backing_0;

    const int32_t gid0 = get_global_id(0);
    const int32_t lid0 = get_local_id(0);
    const int32_t lid1 = get_local_id(1);
    const int32_t grp0 = get_group_id(0);
    const int32_t grp1 = get_group_id(1);
    const int32_t grp2 = get_group_id(2);

    const int32_t array_base = grp2 * x_elems_5 * y_elems_6;
    const int32_t out_base = squot32(destoffset_1, 8) + array_base;
    const int32_t in_base = squot32(srcoffset_3, 8) + array_base;

    // Phase 1: global -> local tile.
    const int32_t src_col = gid0;
    const int32_t src_row0 = grp1 * 32 + lid1;

    if (slt32(src_col, x_elems_5)) {
        for (int32_t step = 0; step < 4; step++) {
            const int32_t src_row = src_row0 + step * 8;
            const int32_t index_in = src_row * x_elems_5 + src_col;

            if (!(slt32(src_row, y_elems_6) &&
                  slt32(index_in, in_elems_7))) {
                continue;
            }
            *(__local double *) &tile[((lid1 + step * 8) * 33 + lid0) *
                                      sizeof(double)] =
                *(__global double *) &srcmem_2[(in_base + index_in) *
                                               sizeof(double)];
        }
    }

    // All tile writes must be visible before the tile is read transposed.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local tile -> global, with the roles of the two local ids
    // swapped.
    const int32_t dst_col = grp1 * 32 + lid0;
    const int32_t dst_row0 = grp0 * 32 + lid1;

    if (slt32(dst_col, y_elems_6)) {
        for (int32_t step = 0; step < 4; step++) {
            const int32_t dst_row = dst_row0 + step * 8;
            const int32_t index_out = dst_row * y_elems_6 + dst_col;

            if (!(slt32(dst_row, x_elems_5) &&
                  slt32(index_out, out_elems_8))) {
                continue;
            }
            *(__global double *) &destmem_0[(out_base + index_out) *
                                            sizeof(double)] =
                *(__local double *) &tile[(lid0 * 33 + lid1 + step * 8) *
                                          sizeof(double)];
        }
    }
}
// map_transpose_f64_low_height
//
// Variant of the double-precision transpose in which each work-item moves a
// single element through a local tile addressed with a row pitch of 17
// doubles.  The x coordinate of the source element is built from the
// dimension-0 group id scaled by 16 * mulx_9, the dimension-0 local id and
// (local id 1 mod mulx_9) * 16; the y coordinate uses local id 1 divided by
// mulx_9.  The write phase uses the same formulas with the two local ids
// swapped.
//
// Parameters:
//   destoffset_1, srcoffset_3  byte offsets into destmem_0 / srcmem_2
//                              (divided by 8 to obtain element offsets).
//   num_arrays_4               not read by this kernel.
//   x_elems_5, y_elems_6       source row length and source row count.
//   in_elems_7, out_elems_8    upper bounds on the per-array element index
//                              for reads and writes respectively.
//   mulx_9                     factor used to spread local ids across x.
//   muly_10                    not read by this kernel.
//   destmem_0, srcmem_2        destination and source buffers.
__kernel void map_transpose_f64_low_height(int32_t destoffset_1,
                                           int32_t srcoffset_3,
                                           int32_t num_arrays_4,
                                           int32_t x_elems_5, int32_t y_elems_6,
                                           int32_t in_elems_7,
                                           int32_t out_elems_8, int32_t mulx_9,
                                           int32_t muly_10, __global
                                           unsigned char *destmem_0, __global
                                           unsigned char *srcmem_2)
{
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 2176);
    __local char *tile = (__local char *) block_11_backing_0;

    const int32_t lid0 = get_local_id(0);
    const int32_t lid1 = get_local_id(1);
    const int32_t grp0 = get_group_id(0);
    const int32_t grp1 = get_group_id(1);
    const int32_t grp2 = get_group_id(2);

    const int32_t array_base = grp2 * x_elems_5 * y_elems_6;
    const int32_t out_base = squot32(destoffset_1, 8) + array_base;
    const int32_t in_base = squot32(srcoffset_3, 8) + array_base;

    // Phase 1: global -> local tile.
    const int32_t src_col = grp0 * 16 * mulx_9 + lid0 +
        srem32(lid1, mulx_9) * 16;
    const int32_t src_row = grp1 * 16 + squot32(lid1, mulx_9);
    const int32_t index_in = src_row * x_elems_5 + src_col;

    if (slt32(src_col, x_elems_5)) {
        if (slt32(src_row, y_elems_6) && slt32(index_in, in_elems_7)) {
            *(__local double *) &tile[(lid1 * 17 + lid0) * sizeof(double)] =
                *(__global double *) &srcmem_2[(in_base + index_in) *
                                               sizeof(double)];
        }
    }

    // All tile writes must be visible before the tile is read transposed.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local tile -> global, with the two local ids swapped.
    const int32_t dst_col = grp1 * 16 + squot32(lid0, mulx_9);
    const int32_t dst_row = grp0 * 16 * mulx_9 + lid1 +
        srem32(lid0, mulx_9) * 16;
    const int32_t index_out = dst_row * y_elems_6 + dst_col;

    if (slt32(dst_col, y_elems_6)) {
        if (slt32(dst_row, x_elems_5) && slt32(index_out, out_elems_8)) {
            *(__global double *) &destmem_0[(out_base + index_out) *
                                            sizeof(double)] =
                *(__local double *) &tile[(lid0 * 17 + lid1) *
                                          sizeof(double)];
        }
    }
}
// Transposes 2D arrays of doubles through a work-group-local tile.
//
// Each work-item copies at most one element from srcmem_2 into the tile,
// waits on a work-group barrier, and then copies at most one element from the
// tile to destmem_0 using swapped local coordinates.  The muly_10 factor is
// used to spread the first local dimension over several 16-tall row strips
// of the input.
//
//   destoffset_1 / srcoffset_3  byte offsets into destmem_0 / srcmem_2
//                               (divided by 8 to obtain element offsets)
//   num_arrays_4                not read by this kernel
//   x_elems_5, y_elems_6        input row length and number of input rows
//   in_elems_7, out_elems_8     upper bounds on the in/out element indices
//   mulx_9                      not read by this kernel
//   muly_10                     strip multiplier (see above)
__kernel void map_transpose_f64_low_width(int32_t destoffset_1,
                                          int32_t srcoffset_3,
                                          int32_t num_arrays_4,
                                          int32_t x_elems_5, int32_t y_elems_6,
                                          int32_t in_elems_7,
                                          int32_t out_elems_8, int32_t mulx_9,
                                          int32_t muly_10, __global
                                          unsigned char *destmem_0, __global
                                          unsigned char *srcmem_2)
{
    // 2176 bytes = 16 * 17 doubles; rows are addressed with a stride of 17.
    ALIGNED_LOCAL_MEMORY(tile_storage, 2176);
    __local double *tile = (__local double *) tile_storage;
    __global double *dst = (__global double *) destmem_0;
    __global double *src = (__global double *) srcmem_2;

    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // The third grid dimension selects which array is being transposed.
    const int32_t array_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dst_base = squot32(destoffset_1, 8) + array_base;
    const int32_t src_base = squot32(srcoffset_3, 8) + array_base;

    // Phase 1: global -> tile.
    const int32_t in_col = grp_x * 16 + squot32(lid_x, muly_10);
    const int32_t in_row = grp_y * 16 * muly_10 + lid_y +
                           srem32(lid_x, muly_10) * 16;
    const int32_t in_idx = in_row * x_elems_5 + in_col;

    if (slt32(in_col, x_elems_5) && slt32(in_row, y_elems_6) &&
        slt32(in_idx, in_elems_7)) {
        tile[lid_y * 17 + lid_x] = src[src_base + in_idx];
    }

    // Every work-item must reach this barrier, so there is no early return.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: tile -> global, with the local coordinates swapped.
    const int32_t out_col = grp_y * 16 * muly_10 + lid_x +
                            srem32(lid_y, muly_10) * 16;
    const int32_t out_row = grp_x * 16 + squot32(lid_y, muly_10);
    const int32_t out_idx = out_row * y_elems_6 + out_col;

    if (slt32(out_col, y_elems_6) && slt32(out_row, x_elems_5) &&
        slt32(out_idx, out_elems_8)) {
        dst[dst_base + out_idx] = tile[lid_x * 17 + lid_y];
    }
}
// Transposes 2D arrays of doubles with one work-item per element and no
// staging through local memory.
//
// The flat global id is split into (array, x, y): the array part selects a
// block of x_elems_5 * y_elems_6 elements, and the element at input position
// y * x_elems_5 + x is copied to output position x * y_elems_6 + y.
// Work-items whose global id is not below in_elems_7 do nothing.
//
//   destoffset_1 / srcoffset_3  byte offsets into destmem_0 / srcmem_2
//                               (divided by 8 to obtain element offsets)
//   num_arrays_4, out_elems_8,
//   mulx_9, muly_10             not read by this kernel
__kernel void map_transpose_f64_small(int32_t destoffset_1, int32_t srcoffset_3,
                                      int32_t num_arrays_4, int32_t x_elems_5,
                                      int32_t y_elems_6, int32_t in_elems_7,
                                      int32_t out_elems_8, int32_t mulx_9,
                                      int32_t muly_10, __global
                                      unsigned char *destmem_0, __global
                                      unsigned char *srcmem_2)
{
    // Never accessed; retained so the kernel's local-memory declaration is
    // the same as before.
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1);

    __global double *dst = (__global double *) destmem_0;
    __global double *src = (__global double *) srcmem_2;

    const int32_t gid = get_global_id(0);
    const int32_t elems_per_array = y_elems_6 * x_elems_5;

    const int32_t array_base = squot32(gid, elems_per_array) * elems_per_array;
    const int32_t col = squot32(srem32(gid, elems_per_array), y_elems_6);
    const int32_t row = srem32(gid, y_elems_6);

    const int32_t dst_base = squot32(destoffset_1, 8) + array_base;
    const int32_t src_base = squot32(srcoffset_3, 8) + array_base;
    const int32_t in_idx = row * x_elems_5 + col;
    const int32_t out_idx = col * y_elems_6 + row;

    if (slt32(gid, in_elems_7)) {
        dst[dst_base + out_idx] = src[src_base + in_idx];
    }
}
// Segmented sum reduction in which every segment is handled by several
// consecutive work-groups.
//
// groups_per_segment = ceil(num_groups_70125 / max(1, N_68316)) work-groups
// cooperate on one segment; the segment index is group_id /
// groups_per_segment and each segment has K_68318 elements.  For element k
// of segment n the mapped value is
//
//     exp( alphas[k] + sum_d q[d, k] - 0.5 * sum_r ( sum_c diff[c] * L[r, c] )^2 )
//
// where diff[c] = x[n, c] - means[c, k], L[r, c] is 0 for r < c,
// exp(q[r, k]) for r == c, and an entry of mem_80426 at a packed triangular
// index for r > c.  (q is mem_80418, means is mem_80422.)
//
// Each work-item accumulates its share of the segment, the work-group reduces
// those partial sums in local memory, and then either writes the result
// directly (one group per segment) or publishes it to
// group_res_arr_mem_82078 and bumps an atomic counter; the group that
// observes the counter at groups_per_segment - 1 combines all per-group
// results and writes the segment's final value to mem_80432.
//
// mem_80429 is per-work-item scratch holding D_68333 doubles per work-item,
// strided by the work-group size.
//
// All global accesses index typed `__global double *` pointers rather than
// multiplying a 32-bit element index by 8, so the byte offset can no longer
// overflow 32-bit signed arithmetic for element indices >= 2^28.
__kernel void segred_large_69652(int32_t N_68316, int32_t D_68317,
                                 int32_t K_68318, int32_t K_68319,
                                 int32_t K_68321, int32_t K_68323,
                                 int32_t D_68333, int32_t num_groups_70125,
                                 __global unsigned char *x_mem_80366, __global
                                 unsigned char *alphas_mem_80367, __global
                                 unsigned char *mem_80418, __global
                                 unsigned char *mem_80422, __global
                                 unsigned char *mem_80426, __global
                                 unsigned char *mem_80429, __global
                                 unsigned char *mem_80432,
                                 int32_t thread_per_segment_82077, __global
                                 unsigned char *group_res_arr_mem_82078,
                                 __global unsigned char *counter_mem_82080)
{
    const int32_t group_size = gmm_objectivezigroup_sizze_69634;
    const int32_t wave_size = LOCKSTEP_WIDTH;

    ALIGNED_LOCAL_MEMORY(red_storage, 8 * gmm_objectivezigroup_sizze_69634);
    ALIGNED_LOCAL_MEMORY(sync_storage, 1);

    // Two views of the same reduction buffer: the volatile one is used for
    // the intra-wave steps, which have no barrier between them.
    __local double *red = (__local double *) red_storage;
    volatile __local double *red_nobarrier =
        (volatile __local double *) red_storage;
    __local bool *last_group_flag = (__local bool *) sync_storage;

    __global double *x_vals = (__global double *) x_mem_80366;
    __global double *alphas = (__global double *) alphas_mem_80367;
    __global double *q_vals = (__global double *) mem_80418;
    __global double *mean_vals = (__global double *) mem_80422;
    __global double *tri_vals = (__global double *) mem_80426;
    __global double *scratch = (__global double *) mem_80429;
    __global double *result = (__global double *) mem_80432;
    __global double *group_res = (__global double *) group_res_arr_mem_82078;
    volatile __global int *counters =
        (volatile __global int *) counter_mem_82080;

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);

    const int32_t groups_per_segment =
        squot32(num_groups_70125 + smax32(1, N_68316) - 1, smax32(1, N_68316));
    const int32_t threads_in_segment = group_size * groups_per_segment;
    const int32_t segment = squot32(group_id, groups_per_segment);
    const int32_t tid_in_segment = srem32(global_tid, threads_in_segment);

    // Position of this work-item inside its wave, and index of that wave.
    const int32_t wave = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave * wave_size;
    const int32_t num_waves = squot32(group_size + wave_size - 1, wave_size);

    const int32_t chunk_size =
        smin32(squot32(K_68318 + threads_in_segment - 1, threads_in_segment),
               squot32(K_68318 - tid_in_segment + thread_per_segment_82077 - 1,
                       thread_per_segment_82077));

    // First scratch slot of this work-item; slot d is d * group_size further.
    const int32_t scratch_base = group_id * (group_size * D_68333) + local_tid;
    // Number of strictly-lower-triangular entries of a D x D matrix.
    const int32_t tri_total = sdiv32(D_68333 * (D_68333 - 1), 2);

    double partial = 0.0;

    for (int32_t i = 0; i < chunk_size; i++) {
        const int32_t k = tid_in_segment + thread_per_segment_82077 * i;

        const double alpha = alphas[k];

        double q_sum = 0.0;
        for (int32_t d = 0; d < D_68333; d++) {
            const double q = q_vals[d * K_68321 + k];
            q_sum = q_sum + q;
        }
        const double base = alpha + q_sum;

        // Stash diff[d] = x[n, d] - means[d, k] in the scratch buffer.
        for (int32_t d = 0; d < D_68333; d++) {
            const double x_nd = x_vals[segment * D_68317 + d];
            const double mean_dk = mean_vals[d * K_68319 + k];
            scratch[scratch_base + d * group_size] = x_nd - mean_dk;
        }

        double sq_norm = 0.0;
        for (int32_t row = 0; row < D_68333; row++) {
            const double q_row = q_vals[row * K_68321 + k];
            double dot = 0.0;

            for (int32_t col = 0; col < D_68333; col++) {
                const double diff = scratch[scratch_base + col * group_size];
                double coeff;

                if (slt32(row, col)) {
                    coeff = 0.0;
                } else if (row == col) {
                    coeff = futrts_exp64(q_row);
                } else {
                    // Packed index of the strictly-lower entry (row, col).
                    const int32_t remaining = D_68333 - col;
                    const int32_t tri_remaining =
                        sdiv32(remaining * (remaining - 1), 2);
                    const int32_t tri_index =
                        (tri_total - tri_remaining) + ((row - col) - 1);
                    coeff = tri_vals[tri_index * K_68323 + k];
                }

                // Product and sum stay in separate statements, as before.
                const double term = diff * coeff;
                dot = dot + term;
            }

            const double dot_sq = dot * dot;
            sq_norm = sq_norm + dot_sq;
        }

        const double half_sq_norm = 0.5 * sq_norm;
        const double exponent = base - half_sq_norm;
        const double mapped = futrts_exp64(exponent);

        partial = partial + mapped;
    }

    // Publish the per-work-item partial sum for the work-group reduction.
    red[local_tid] = partial;
    barrier(CLK_LOCAL_MEM_FENCE);

    double acc = 0.0;
    if (slt32(local_tid, group_size)) {
        acc = red[local_tid];
    }

    // Intra-wave tree reduction (no barriers; volatile accesses).
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, group_size) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = red_nobarrier[local_tid + offset];
            acc = acc + other;
            red_nobarrier[local_tid] = acc;
        }
    }

    // Cross-wave tree reduction among the first work-item of each wave.
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip * wave_size;
        if (slt32(local_tid + offset, group_size) &&
            (lane == 0 && (wave & (2 * skip - 1)) == 0)) {
            const double other = red[local_tid + offset];
            acc = acc + other;
            red[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (groups_per_segment == 1) {
        // This group alone covers the segment: write the final result.
        if (local_tid == 0) {
            result[segment] = acc;
        }
    } else {
        const int32_t counter_slot = srem32(segment, 1024);

        if (local_tid == 0) {
            group_res[group_id] = acc;
            mem_fence_global();
            const int32_t old_counter = atomic_add(&counters[counter_slot], 1);
            *last_group_flag = old_counter == groups_per_segment - 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        const bool is_last_group = *last_group_flag;
        if (is_last_group) {
            // Reset the counter for the next use of this slot.
            if (local_tid == 0) {
                atomic_add(&counters[counter_slot], 0 - groups_per_segment);
            }

            // Load the per-group results of this segment (0.0 as padding).
            if (slt32(local_tid, groups_per_segment)) {
                partial = group_res[segment * groups_per_segment + local_tid];
            } else {
                partial = 0.0;
            }
            red[local_tid] = partial;
            barrier(CLK_LOCAL_MEM_FENCE);

            double final_acc = 0.0;
            if (slt32(local_tid, group_size)) {
                final_acc = red[local_tid];
            }

            for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
                if (slt32(local_tid + offset, group_size) &&
                    (lane & (2 * offset - 1)) == 0) {
                    const double other = red_nobarrier[local_tid + offset];
                    final_acc = final_acc + other;
                    red_nobarrier[local_tid] = final_acc;
                }
            }

            for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
                barrier(CLK_LOCAL_MEM_FENCE);
                const int32_t offset = skip * wave_size;
                if (slt32(local_tid + offset, group_size) &&
                    (lane == 0 && (wave & (2 * skip - 1)) == 0)) {
                    const double other = red[local_tid + offset];
                    final_acc = final_acc + other;
                    red[local_tid] = final_acc;
                }
            }

            if (local_tid == 0) {
                result[segment] = final_acc;
            }
        }
    }
}
// Segmented sum reduction in which every segment is handled by several
// consecutive work-groups.
//
// groups_per_segment = ceil(num_groups_71156 / max(1, K_68318)) work-groups
// cooperate on one segment; the segment index is group_id /
// groups_per_segment and each segment has D_68333 elements.  The value
// summed for element j of segment s is qs[s * D_68322 + j].
//
// Each work-item accumulates its share of the segment, the work-group reduces
// those partial sums in local memory, and then either writes the result
// directly (one group per segment) or publishes it to
// group_res_arr_mem_82311 and bumps an atomic counter; the group that
// observes the counter at groups_per_segment - 1 combines all per-group
// results and writes the segment's final value to mem_80492.
//
// All global accesses index typed `__global double *` pointers rather than
// multiplying a 32-bit element index by 8, so the byte offset can no longer
// overflow 32-bit signed arithmetic for element indices >= 2^28.
__kernel void segred_large_70701(int32_t K_68318, int32_t D_68322,
                                 int32_t D_68333, int32_t num_groups_71156,
                                 __global unsigned char *qs_mem_80369, __global
                                 unsigned char *mem_80492,
                                 int32_t thread_per_segment_82310, __global
                                 unsigned char *group_res_arr_mem_82311,
                                 __global unsigned char *counter_mem_82313)
{
    const int32_t group_size = gmm_objectivezigroup_sizze_70683;
    const int32_t wave_size = LOCKSTEP_WIDTH;

    ALIGNED_LOCAL_MEMORY(red_storage, 8 * gmm_objectivezigroup_sizze_70683);
    ALIGNED_LOCAL_MEMORY(sync_storage, 1);

    // Two views of the same reduction buffer: the volatile one is used for
    // the intra-wave steps, which have no barrier between them.
    __local double *red = (__local double *) red_storage;
    volatile __local double *red_nobarrier =
        (volatile __local double *) red_storage;
    __local bool *last_group_flag = (__local bool *) sync_storage;

    __global double *qs = (__global double *) qs_mem_80369;
    __global double *result = (__global double *) mem_80492;
    __global double *group_res = (__global double *) group_res_arr_mem_82311;
    volatile __global int *counters =
        (volatile __global int *) counter_mem_82313;

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);

    const int32_t groups_per_segment =
        squot32(num_groups_71156 + smax32(1, K_68318) - 1, smax32(1, K_68318));
    const int32_t threads_in_segment = group_size * groups_per_segment;
    const int32_t segment = squot32(group_id, groups_per_segment);
    const int32_t tid_in_segment = srem32(global_tid, threads_in_segment);

    // Position of this work-item inside its wave, and index of that wave.
    const int32_t wave = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave * wave_size;
    const int32_t num_waves = squot32(group_size + wave_size - 1, wave_size);

    const int32_t chunk_size =
        smin32(squot32(D_68333 + threads_in_segment - 1, threads_in_segment),
               squot32(D_68333 - tid_in_segment + thread_per_segment_82310 - 1,
                       thread_per_segment_82310));

    double partial = 0.0;

    for (int32_t i = 0; i < chunk_size; i++) {
        const int32_t elem = tid_in_segment + thread_per_segment_82310 * i;
        const double value = qs[segment * D_68322 + elem];
        partial = partial + value;
    }

    // Publish the per-work-item partial sum for the work-group reduction.
    red[local_tid] = partial;
    barrier(CLK_LOCAL_MEM_FENCE);

    double acc = 0.0;
    if (slt32(local_tid, group_size)) {
        acc = red[local_tid];
    }

    // Intra-wave tree reduction (no barriers; volatile accesses).
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, group_size) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = red_nobarrier[local_tid + offset];
            acc = acc + other;
            red_nobarrier[local_tid] = acc;
        }
    }

    // Cross-wave tree reduction among the first work-item of each wave.
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip * wave_size;
        if (slt32(local_tid + offset, group_size) &&
            (lane == 0 && (wave & (2 * skip - 1)) == 0)) {
            const double other = red[local_tid + offset];
            acc = acc + other;
            red[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (groups_per_segment == 1) {
        // This group alone covers the segment: write the final result.
        if (local_tid == 0) {
            result[segment] = acc;
        }
    } else {
        const int32_t counter_slot = srem32(segment, 1024);

        if (local_tid == 0) {
            group_res[group_id] = acc;
            mem_fence_global();
            const int32_t old_counter = atomic_add(&counters[counter_slot], 1);
            *last_group_flag = old_counter == groups_per_segment - 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        const bool is_last_group = *last_group_flag;
        if (is_last_group) {
            // Reset the counter for the next use of this slot.
            if (local_tid == 0) {
                atomic_add(&counters[counter_slot], 0 - groups_per_segment);
            }

            // Load the per-group results of this segment (0.0 as padding).
            if (slt32(local_tid, groups_per_segment)) {
                partial = group_res[segment * groups_per_segment + local_tid];
            } else {
                partial = 0.0;
            }
            red[local_tid] = partial;
            barrier(CLK_LOCAL_MEM_FENCE);

            double final_acc = 0.0;
            if (slt32(local_tid, group_size)) {
                final_acc = red[local_tid];
            }

            for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
                if (slt32(local_tid + offset, group_size) &&
                    (lane & (2 * offset - 1)) == 0) {
                    const double other = red_nobarrier[local_tid + offset];
                    final_acc = final_acc + other;
                    red_nobarrier[local_tid] = final_acc;
                }
            }

            for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
                barrier(CLK_LOCAL_MEM_FENCE);
                const int32_t offset = skip * wave_size;
                if (slt32(local_tid + offset, group_size) &&
                    (lane == 0 && (wave & (2 * skip - 1)) == 0)) {
                    const double other = red[local_tid + offset];
                    final_acc = final_acc + other;
                    red[local_tid] = final_acc;
                }
            }

            if (local_tid == 0) {
                result[segment] = final_acc;
            }
        }
    }
}
// Segmented sum-of-squares reduction in which every segment is handled by
// several consecutive work-groups.
//
// groups_per_segment = ceil(num_groups_71120 / max(1, K_68318)) work-groups
// cooperate on one segment; the segment index is group_id /
// groups_per_segment and each segment has triD_68324 elements.  The value
// summed for element j of segment s is the square of
// icf[s * triD_68324 + j].
//
// Each work-item accumulates its share of the segment, the work-group reduces
// those partial sums in local memory, and then either writes the result
// directly (one group per segment) or publishes it to
// group_res_arr_mem_82272 and bumps an atomic counter; the group that
// observes the counter at groups_per_segment - 1 combines all per-group
// results and writes the segment's final value to mem_80486.
//
// All global accesses index typed `__global double *` pointers rather than
// multiplying a 32-bit element index by 8, so the byte offset can no longer
// overflow 32-bit signed arithmetic for element indices >= 2^28.
__kernel void segred_large_70743(int32_t K_68318, int32_t triD_68324,
                                 int32_t num_groups_71120, __global
                                 unsigned char *icf_mem_80370, __global
                                 unsigned char *mem_80486,
                                 int32_t thread_per_segment_82271, __global
                                 unsigned char *group_res_arr_mem_82272,
                                 __global unsigned char *counter_mem_82274)
{
    const int32_t group_size = gmm_objectivezigroup_sizze_70725;
    const int32_t wave_size = LOCKSTEP_WIDTH;

    ALIGNED_LOCAL_MEMORY(red_storage, 8 * gmm_objectivezigroup_sizze_70725);
    ALIGNED_LOCAL_MEMORY(sync_storage, 1);

    // Two views of the same reduction buffer: the volatile one is used for
    // the intra-wave steps, which have no barrier between them.
    __local double *red = (__local double *) red_storage;
    volatile __local double *red_nobarrier =
        (volatile __local double *) red_storage;
    __local bool *last_group_flag = (__local bool *) sync_storage;

    __global double *icf = (__global double *) icf_mem_80370;
    __global double *result = (__global double *) mem_80486;
    __global double *group_res = (__global double *) group_res_arr_mem_82272;
    volatile __global int *counters =
        (volatile __global int *) counter_mem_82274;

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);

    const int32_t groups_per_segment =
        squot32(num_groups_71120 + smax32(1, K_68318) - 1, smax32(1, K_68318));
    const int32_t threads_in_segment = group_size * groups_per_segment;
    const int32_t segment = squot32(group_id, groups_per_segment);
    const int32_t tid_in_segment = srem32(global_tid, threads_in_segment);

    // Position of this work-item inside its wave, and index of that wave.
    const int32_t wave = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave * wave_size;
    const int32_t num_waves = squot32(group_size + wave_size - 1, wave_size);

    const int32_t chunk_size =
        smin32(squot32(triD_68324 + threads_in_segment - 1, threads_in_segment),
               squot32(triD_68324 - tid_in_segment +
                       thread_per_segment_82271 - 1,
                       thread_per_segment_82271));

    double partial = 0.0;

    for (int32_t i = 0; i < chunk_size; i++) {
        const int32_t elem = tid_in_segment + thread_per_segment_82271 * i;
        const double value = icf[segment * triD_68324 + elem];
        // Square and sum stay in separate statements, as before.
        const double squared = value * value;
        partial = partial + squared;
    }

    // Publish the per-work-item partial sum for the work-group reduction.
    red[local_tid] = partial;
    barrier(CLK_LOCAL_MEM_FENCE);

    double acc = 0.0;
    if (slt32(local_tid, group_size)) {
        acc = red[local_tid];
    }

    // Intra-wave tree reduction (no barriers; volatile accesses).
    for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
        if (slt32(local_tid + offset, group_size) &&
            (lane & (2 * offset - 1)) == 0) {
            const double other = red_nobarrier[local_tid + offset];
            acc = acc + other;
            red_nobarrier[local_tid] = acc;
        }
    }

    // Cross-wave tree reduction among the first work-item of each wave.
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t offset = skip * wave_size;
        if (slt32(local_tid + offset, group_size) &&
            (lane == 0 && (wave & (2 * skip - 1)) == 0)) {
            const double other = red[local_tid + offset];
            acc = acc + other;
            red[local_tid] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (groups_per_segment == 1) {
        // This group alone covers the segment: write the final result.
        if (local_tid == 0) {
            result[segment] = acc;
        }
    } else {
        const int32_t counter_slot = srem32(segment, 1024);

        if (local_tid == 0) {
            group_res[group_id] = acc;
            mem_fence_global();
            const int32_t old_counter = atomic_add(&counters[counter_slot], 1);
            *last_group_flag = old_counter == groups_per_segment - 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        const bool is_last_group = *last_group_flag;
        if (is_last_group) {
            // Reset the counter for the next use of this slot.
            if (local_tid == 0) {
                atomic_add(&counters[counter_slot], 0 - groups_per_segment);
            }

            // Load the per-group results of this segment (0.0 as padding).
            if (slt32(local_tid, groups_per_segment)) {
                partial = group_res[segment * groups_per_segment + local_tid];
            } else {
                partial = 0.0;
            }
            red[local_tid] = partial;
            barrier(CLK_LOCAL_MEM_FENCE);

            double final_acc = 0.0;
            if (slt32(local_tid, group_size)) {
                final_acc = red[local_tid];
            }

            for (int32_t offset = 1; slt32(offset, wave_size); offset *= 2) {
                if (slt32(local_tid + offset, group_size) &&
                    (lane & (2 * offset - 1)) == 0) {
                    const double other = red_nobarrier[local_tid + offset];
                    final_acc = final_acc + other;
                    red_nobarrier[local_tid] = final_acc;
                }
            }

            for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
                barrier(CLK_LOCAL_MEM_FENCE);
                const int32_t offset = skip * wave_size;
                if (slt32(local_tid + offset, group_size) &&
                    (lane == 0 && (wave & (2 * skip - 1)) == 0)) {
                    const double other = red[local_tid + offset];
                    final_acc = final_acc + other;
                    red[local_tid] = final_acc;
                }
            }

            if (local_tid == 0) {
                result[segment] = final_acc;
            }
        }
    }
}
__kernel void segred_large_70768(int32_t K_68318, int32_t D_68322,
int32_t D_68333, int32_t num_groups_71094,
__global unsigned char *qs_mem_80369, __global
unsigned char *mem_80483,
int32_t thread_per_segment_82234, __global
unsigned char *group_res_arr_mem_82235,
__global unsigned char *counter_mem_82237)
{
const int32_t group_sizze_71084 = gmm_objectivezigroup_sizze_70750;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
ALIGNED_LOCAL_MEMORY(red_arr_mem_82239_backing_0, 8 *
gmm_objectivezigroup_sizze_70750);
ALIGNED_LOCAL_MEMORY(sync_arr_mem_82241_backing_1, 1);
int32_t global_tid_70768;
int32_t local_tid_70769;
int32_t group_sizze_82231;
int32_t wave_sizze_82230;
int32_t group_id_70770;
global_tid_70768 = get_global_id(0);
local_tid_70769 = get_local_id(0);
group_sizze_82231 = get_local_size(0);
wave_sizze_82230 = LOCKSTEP_WIDTH;
group_id_70770 = get_group_id(0);
int32_t gtid_70746;
int32_t gtid_70767;
__local char *red_arr_mem_82239;
red_arr_mem_82239 = (__local char *) red_arr_mem_82239_backing_0;
__local char *sync_arr_mem_82241;
sync_arr_mem_82241 = (__local char *) sync_arr_mem_82241_backing_1;
gtid_70746 = squot32(group_id_70770, squot32(num_groups_71094 + smax32(1,
K_68318) -
1, smax32(1, K_68318)));
int32_t chunk_sizze_82243 = smin32(squot32(D_68333 + group_sizze_71084 *
squot32(num_groups_71094 +
smax32(1, K_68318) - 1,
smax32(1, K_68318)) - 1,
group_sizze_71084 *
squot32(num_groups_71094 +
smax32(1, K_68318) - 1,
smax32(1, K_68318))),
squot32(D_68333 -
srem32(global_tid_70768,
group_sizze_71084 *
squot32(num_groups_71094 +
smax32(1,
K_68318) -
1, smax32(1,
K_68318))) +
thread_per_segment_82234 - 1,
thread_per_segment_82234));
double x_71100;
double x_71101;
x_71100 = 0.0;
for (int32_t i_82247 = 0; i_82247 < chunk_sizze_82243; i_82247++) {
gtid_70767 = srem32(global_tid_70768, group_sizze_71084 *
squot32(num_groups_71094 + smax32(1, K_68318) - 1,
smax32(1, K_68318))) +
thread_per_segment_82234 * i_82247;
// apply map function
{
double qs_elem_elem_71104;
double res_71105;
double res_71106;
qs_elem_elem_71104 = *(__global
double *) &qs_mem_80369[(gtid_70746 *
D_68322 +
gtid_70767) * 8];
res_71105 = futrts_exp64(qs_elem_elem_71104);
res_71106 = res_71105 * res_71105;
// save results to be reduced
{
x_71101 = res_71106;
}
// save map-out results
{ }
// apply reduction operator
{
double res_71102 = x_71100 + x_71101;
x_71100 = res_71102;
}
}
}
// to reduce current chunk, first store our result to memory
{
*(__local double *) &red_arr_mem_82239[local_tid_70769 * 8] = x_71100;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t offset_82248;
int32_t skip_waves_82249;
double x_82244;
double x_82245;
offset_82248 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_70769, group_sizze_71084)) {
x_82244 = *(__local double *) &red_arr_mem_82239[(local_tid_70769 +
offset_82248) *
8];
}
}
offset_82248 = 1;
while (slt32(offset_82248, wave_sizze_82230)) {
if (slt32(local_tid_70769 + offset_82248, group_sizze_71084) &&
((local_tid_70769 - squot32(local_tid_70769, wave_sizze_82230) *
wave_sizze_82230) & (2 * offset_82248 - 1)) == 0) {
// read array element
{
x_82245 = *(volatile __local
double *) &red_arr_mem_82239[(local_tid_70769 +
offset_82248) * 8];
}
// apply reduction operation
{
double res_82246 = x_82244 + x_82245;
x_82244 = res_82246;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82239[local_tid_70769 * 8] = x_82244;
}
}
offset_82248 *= 2;
}
skip_waves_82249 = 1;
while (slt32(skip_waves_82249, squot32(group_sizze_71084 +
wave_sizze_82230 - 1,
wave_sizze_82230))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82248 = skip_waves_82249 * wave_sizze_82230;
if (slt32(local_tid_70769 + offset_82248, group_sizze_71084) &&
((local_tid_70769 - squot32(local_tid_70769, wave_sizze_82230) *
wave_sizze_82230) == 0 && (squot32(local_tid_70769,
wave_sizze_82230) & (2 *
skip_waves_82249 -
1)) ==
0)) {
// read array element
{
x_82245 = *(__local
double *) &red_arr_mem_82239[(local_tid_70769 +
offset_82248) * 8];
}
// apply reduction operation
{
double res_82246 = x_82244 + x_82245;
x_82244 = res_82246;
}
// write result of operation
{
*(__local double *) &red_arr_mem_82239[local_tid_70769 * 8] =
x_82244;
}
}
skip_waves_82249 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (squot32(num_groups_71094 + smax32(1, K_68318) - 1, smax32(1,
K_68318)) ==
1) {
// first thread in group saves final result to memory
{
if (local_tid_70769 == 0) {
*(__global double *) &mem_80483[gtid_70746 * 8] = x_82244;
}
}
} else {
int32_t old_counter_82250;
// first thread in group saves group result to global memory
{
if (local_tid_70769 == 0) {
*(__global double *) &group_res_arr_mem_82235[group_id_70770 *
8] = x_82244;
mem_fence_global();
old_counter_82250 = atomic_add((volatile __global int *) &
counter_mem_82237[srem32(squot32(group_id_70770,
squot32(num_groups_71094 +
smax32(1,
K_68318) -
1,
smax32(1,
K_68318))),
1024) *
4], 1);
*(__local bool *) &sync_arr_mem_82241[0] = old_counter_82250 ==
squot32(num_groups_71094 + smax32(1, K_68318) - 1, smax32(1,
K_68318)) -
1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_group_82251 = *(__local bool *) &sync_arr_mem_82241[0];
if (is_last_group_82251) {
if (local_tid_70769 == 0) {
old_counter_82250 = atomic_add((volatile __global int *) &
counter_mem_82237[srem32(squot32(group_id_70770,
squot32(num_groups_71094 +
smax32(1,
K_68318) -
1,
smax32(1,
K_68318))),
1024) *
4], 0 -
squot32(num_groups_71094 +
smax32(1, K_68318) - 1,
smax32(1, K_68318)));
}
// read in the per-group-results
{
if (slt32(local_tid_70769, squot32(num_groups_71094 + smax32(1,
K_68318) -
1, smax32(1, K_68318)))) {
x_71100 = *(__global
double *) &group_res_arr_mem_82235[(squot32(group_id_70770,
squot32(num_groups_71094 +
smax32(1,
K_68318) -
1,
smax32(1,
K_68318))) *
squot32(num_groups_71094 +
smax32(1,
K_68318) -
1,
smax32(1,
K_68318)) +
local_tid_70769) *
8];
} else {
x_71100 = 0.0;
}
*(__local double *) &red_arr_mem_82239[local_tid_70769 * 8] =
x_71100;
}
barrier(CLK_LOCAL_MEM_FENCE);
// reduce the per-group results
{
int32_t offset_82252;
int32_t skip_waves_82253;
double x_82244;
double x_82245;
offset_82252 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_70769, group_sizze_71084)) {
x_82244 = *(__local
double *) &red_arr_mem_82239[(local_tid_70769 +
offset_82252) *
8];
}
}
offset_82252 = 1;
while (slt32(offset_82252, wave_sizze_82230)) {
if (slt32(local_tid_70769 + offset_82252,
group_sizze_71084) && ((local_tid_70769 -
squot32(local_tid_70769,
wave_sizze_82230) *
wave_sizze_82230) & (2 *
offset_82252 -
1)) ==
0) {
// read array element
{
x_82245 = *(volatile __local
double *) &red_arr_mem_82239[(local_tid_70769 +
offset_82252) *
8];
}
// apply reduction operation
{
double res_82246 = x_82244 + x_82245;
x_82244 = res_82246;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82239[local_tid_70769 *
8] = x_82244;
}
}
offset_82252 *= 2;
}
skip_waves_82253 = 1;
while (slt32(skip_waves_82253, squot32(group_sizze_71084 +
wave_sizze_82230 - 1,
wave_sizze_82230))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82252 = skip_waves_82253 * wave_sizze_82230;
if (slt32(local_tid_70769 + offset_82252,
group_sizze_71084) && ((local_tid_70769 -
squot32(local_tid_70769,
wave_sizze_82230) *
wave_sizze_82230) == 0 &&
(squot32(local_tid_70769,
wave_sizze_82230) &
(2 * skip_waves_82253 -
1)) == 0)) {
// read array element
{
x_82245 = *(__local
double *) &red_arr_mem_82239[(local_tid_70769 +
offset_82252) *
8];
}
// apply reduction operation
{
double res_82246 = x_82244 + x_82245;
x_82244 = res_82246;
}
// write result of operation
{
*(__local
double *) &red_arr_mem_82239[local_tid_70769 *
8] = x_82244;
}
}
skip_waves_82253 *= 2;
}
// and back to memory with the final result
{
if (local_tid_70769 == 0) {
*(__global double *) &mem_80483[gtid_70746 * 8] =
x_82244;
}
}
}
}
}
}
__kernel void segred_large_73083(int32_t N_68508, int32_t K_68510,
int32_t D_68526, int32_t num_groups_77901,
__global unsigned char *mem_81382, __global
unsigned char *mem_81387,
int32_t thread_per_segment_83014, __global
unsigned char *group_res_arr_mem_83015,
__global unsigned char *counter_mem_83017)
{
const int32_t group_sizze_77891 = rev_gmm_objectivezigroup_sizze_73065;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
ALIGNED_LOCAL_MEMORY(red_arr_mem_83019_backing_0, 8 *
rev_gmm_objectivezigroup_sizze_73065);
ALIGNED_LOCAL_MEMORY(sync_arr_mem_83021_backing_1, 1);
int32_t global_tid_73083;
int32_t local_tid_73084;
int32_t group_sizze_83011;
int32_t wave_sizze_83010;
int32_t group_id_73085;
global_tid_73083 = get_global_id(0);
local_tid_73084 = get_local_id(0);
group_sizze_83011 = get_local_size(0);
wave_sizze_83010 = LOCKSTEP_WIDTH;
group_id_73085 = get_group_id(0);
int32_t gtid_73057;
int32_t gtid_73058;
int32_t gtid_73059;
int32_t gtid_73082;
__local char *red_arr_mem_83019;
red_arr_mem_83019 = (__local char *) red_arr_mem_83019_backing_0;
__local char *sync_arr_mem_83021;
sync_arr_mem_83021 = (__local char *) sync_arr_mem_83021_backing_1;
gtid_73057 = squot32(squot32(group_id_73085, squot32(num_groups_77901 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526))),
K_68510 * D_68526);
gtid_73058 = squot32(squot32(group_id_73085, squot32(num_groups_77901 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526))) -
squot32(squot32(group_id_73085,
squot32(num_groups_77901 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 *
K_68510 * D_68526))),
K_68510 * D_68526) * (K_68510 * D_68526),
D_68526);
gtid_73059 = squot32(group_id_73085, squot32(num_groups_77901 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 *
K_68510 *
D_68526))) -
squot32(squot32(group_id_73085, squot32(num_groups_77901 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 * K_68510 *
D_68526))), K_68510 *
D_68526) * (K_68510 * D_68526) - squot32(squot32(group_id_73085,
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))) -
squot32(squot32(group_id_73085,
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))),
K_68510 *
D_68526) *
(K_68510 * D_68526),
D_68526) * D_68526;
int32_t chunk_sizze_83023 = smin32(squot32(D_68526 + group_sizze_77891 *
squot32(num_groups_77901 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526)) - 1,
group_sizze_77891 *
squot32(num_groups_77901 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526))),
squot32(D_68526 -
srem32(global_tid_73083,
group_sizze_77891 *
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1,
N_68508 *
K_68510 *
D_68526))) +
thread_per_segment_83014 - 1,
thread_per_segment_83014));
double x_77907;
double x_77908;
x_77907 = 0.0;
for (int32_t i_83027 = 0; i_83027 < chunk_sizze_83023; i_83027++) {
gtid_73082 = srem32(global_tid_73083, group_sizze_77891 *
squot32(num_groups_77901 + smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 * K_68510 * D_68526))) +
thread_per_segment_83014 * i_83027;
// apply map function
{
double x_77913 = *(__global double *) &mem_81382[(gtid_73057 *
(D_68526 *
D_68526 *
K_68510) +
gtid_73058 *
(D_68526 *
D_68526) +
gtid_73059 *
D_68526 +
gtid_73082) * 8];
// save results to be reduced
{
x_77908 = x_77913;
}
// save map-out results
{ }
// apply reduction operator
{
double res_77909 = x_77907 + x_77908;
x_77907 = res_77909;
}
}
}
// to reduce current chunk, first store our result to memory
{
*(__local double *) &red_arr_mem_83019[local_tid_73084 * 8] = x_77907;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t offset_83028;
int32_t skip_waves_83029;
double x_83024;
double x_83025;
offset_83028 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_73084, group_sizze_77891)) {
x_83024 = *(__local double *) &red_arr_mem_83019[(local_tid_73084 +
offset_83028) *
8];
}
}
offset_83028 = 1;
while (slt32(offset_83028, wave_sizze_83010)) {
if (slt32(local_tid_73084 + offset_83028, group_sizze_77891) &&
((local_tid_73084 - squot32(local_tid_73084, wave_sizze_83010) *
wave_sizze_83010) & (2 * offset_83028 - 1)) == 0) {
// read array element
{
x_83025 = *(volatile __local
double *) &red_arr_mem_83019[(local_tid_73084 +
offset_83028) * 8];
}
// apply reduction operation
{
double res_83026 = x_83024 + x_83025;
x_83024 = res_83026;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_83019[local_tid_73084 * 8] = x_83024;
}
}
offset_83028 *= 2;
}
skip_waves_83029 = 1;
while (slt32(skip_waves_83029, squot32(group_sizze_77891 +
wave_sizze_83010 - 1,
wave_sizze_83010))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_83028 = skip_waves_83029 * wave_sizze_83010;
if (slt32(local_tid_73084 + offset_83028, group_sizze_77891) &&
((local_tid_73084 - squot32(local_tid_73084, wave_sizze_83010) *
wave_sizze_83010) == 0 && (squot32(local_tid_73084,
wave_sizze_83010) & (2 *
skip_waves_83029 -
1)) ==
0)) {
// read array element
{
x_83025 = *(__local
double *) &red_arr_mem_83019[(local_tid_73084 +
offset_83028) * 8];
}
// apply reduction operation
{
double res_83026 = x_83024 + x_83025;
x_83024 = res_83026;
}
// write result of operation
{
*(__local double *) &red_arr_mem_83019[local_tid_73084 * 8] =
x_83024;
}
}
skip_waves_83029 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (squot32(num_groups_77901 + smax32(1, N_68508 * K_68510 * D_68526) - 1,
smax32(1, N_68508 * K_68510 * D_68526)) == 1) {
// first thread in group saves final result to memory
{
if (local_tid_73084 == 0) {
*(__global double *) &mem_81387[(gtid_73057 * (D_68526 *
K_68510) +
gtid_73058 * D_68526 +
gtid_73059) * 8] = x_83024;
}
}
} else {
int32_t old_counter_83030;
// first thread in group saves group result to global memory
{
if (local_tid_73084 == 0) {
*(__global double *) &group_res_arr_mem_83015[group_id_73085 *
8] = x_83024;
mem_fence_global();
old_counter_83030 = atomic_add((volatile __global int *) &
counter_mem_83017[srem32(squot32(group_id_73085,
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))),
1024) *
4], 1);
*(__local bool *) &sync_arr_mem_83021[0] = old_counter_83030 ==
squot32(num_groups_77901 + smax32(1, N_68508 * K_68510 *
D_68526) - 1, smax32(1,
N_68508 *
K_68510 *
D_68526)) -
1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_group_83031 = *(__local bool *) &sync_arr_mem_83021[0];
if (is_last_group_83031) {
if (local_tid_73084 == 0) {
old_counter_83030 = atomic_add((volatile __global int *) &
counter_mem_83017[srem32(squot32(group_id_73085,
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))),
1024) *
4], 0 -
squot32(num_groups_77901 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526)));
}
// read in the per-group-results
{
if (slt32(local_tid_73084, squot32(num_groups_77901 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 *
K_68510 *
D_68526)))) {
x_77907 = *(__global
double *) &group_res_arr_mem_83015[(squot32(group_id_73085,
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))) *
squot32(num_groups_77901 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526)) +
local_tid_73084) *
8];
} else {
x_77907 = 0.0;
}
*(__local double *) &red_arr_mem_83019[local_tid_73084 * 8] =
x_77907;
}
barrier(CLK_LOCAL_MEM_FENCE);
// reduce the per-group results
{
int32_t offset_83032;
int32_t skip_waves_83033;
double x_83024;
double x_83025;
offset_83032 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_73084, group_sizze_77891)) {
x_83024 = *(__local
double *) &red_arr_mem_83019[(local_tid_73084 +
offset_83032) *
8];
}
}
offset_83032 = 1;
while (slt32(offset_83032, wave_sizze_83010)) {
if (slt32(local_tid_73084 + offset_83032,
group_sizze_77891) && ((local_tid_73084 -
squot32(local_tid_73084,
wave_sizze_83010) *
wave_sizze_83010) & (2 *
offset_83032 -
1)) ==
0) {
// read array element
{
x_83025 = *(volatile __local
double *) &red_arr_mem_83019[(local_tid_73084 +
offset_83032) *
8];
}
// apply reduction operation
{
double res_83026 = x_83024 + x_83025;
x_83024 = res_83026;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_83019[local_tid_73084 *
8] = x_83024;
}
}
offset_83032 *= 2;
}
skip_waves_83033 = 1;
while (slt32(skip_waves_83033, squot32(group_sizze_77891 +
wave_sizze_83010 - 1,
wave_sizze_83010))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_83032 = skip_waves_83033 * wave_sizze_83010;
if (slt32(local_tid_73084 + offset_83032,
group_sizze_77891) && ((local_tid_73084 -
squot32(local_tid_73084,
wave_sizze_83010) *
wave_sizze_83010) == 0 &&
(squot32(local_tid_73084,
wave_sizze_83010) &
(2 * skip_waves_83033 -
1)) == 0)) {
// read array element
{
x_83025 = *(__local
double *) &red_arr_mem_83019[(local_tid_73084 +
offset_83032) *
8];
}
// apply reduction operation
{
double res_83026 = x_83024 + x_83025;
x_83024 = res_83026;
}
// write result of operation
{
*(__local
double *) &red_arr_mem_83019[local_tid_73084 *
8] = x_83024;
}
}
skip_waves_83033 *= 2;
}
// and back to memory with the final result
{
if (local_tid_73084 == 0) {
*(__global double *) &mem_81387[(gtid_73057 * (D_68526 *
K_68510) +
gtid_73058 * D_68526 +
gtid_73059) * 8] =
x_83024;
}
}
}
}
}
}
__kernel void segred_large_73254(int32_t N_68508, int32_t K_68510,
int32_t triD_68516, int32_t D_68526,
int32_t num_groups_77803, __global
unsigned char *mem_81324, __global
unsigned char *mem_81329,
int32_t thread_per_segment_82960, __global
unsigned char *group_res_arr_mem_82961,
__global unsigned char *counter_mem_82963)
{
const int32_t group_sizze_77793 = rev_gmm_objectivezigroup_sizze_73236;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
ALIGNED_LOCAL_MEMORY(red_arr_mem_82965_backing_0, 8 *
rev_gmm_objectivezigroup_sizze_73236);
ALIGNED_LOCAL_MEMORY(sync_arr_mem_82967_backing_1, 1);
int32_t global_tid_73254;
int32_t local_tid_73255;
int32_t group_sizze_82957;
int32_t wave_sizze_82956;
int32_t group_id_73256;
global_tid_73254 = get_global_id(0);
local_tid_73255 = get_local_id(0);
group_sizze_82957 = get_local_size(0);
wave_sizze_82956 = LOCKSTEP_WIDTH;
group_id_73256 = get_group_id(0);
int32_t gtid_73228;
int32_t gtid_73229;
int32_t gtid_73230;
int32_t gtid_73253;
__local char *red_arr_mem_82965;
red_arr_mem_82965 = (__local char *) red_arr_mem_82965_backing_0;
__local char *sync_arr_mem_82967;
sync_arr_mem_82967 = (__local char *) sync_arr_mem_82967_backing_1;
gtid_73228 = squot32(squot32(group_id_73256, squot32(num_groups_77803 +
smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
triD_68516))),
K_68510 * triD_68516);
gtid_73229 = squot32(squot32(group_id_73256, squot32(num_groups_77803 +
smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
triD_68516))) -
squot32(squot32(group_id_73256,
squot32(num_groups_77803 + smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 *
triD_68516))),
K_68510 * triD_68516) * (K_68510 * triD_68516),
triD_68516);
gtid_73230 = squot32(group_id_73256, squot32(num_groups_77803 + smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 *
triD_68516))) -
squot32(squot32(group_id_73256, squot32(num_groups_77803 + smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1, smax32(1, N_68508 * K_68510 *
triD_68516))),
K_68510 * triD_68516) * (K_68510 * triD_68516) -
squot32(squot32(group_id_73256, squot32(num_groups_77803 + smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1, smax32(1, N_68508 * K_68510 *
triD_68516))) -
squot32(squot32(group_id_73256, squot32(num_groups_77803 +
smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
triD_68516))),
K_68510 * triD_68516) * (K_68510 * triD_68516),
triD_68516) * triD_68516;
int32_t chunk_sizze_82969 = smin32(squot32(D_68526 + group_sizze_77793 *
squot32(num_groups_77803 +
smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
triD_68516)) - 1,
group_sizze_77793 *
squot32(num_groups_77803 +
smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
triD_68516))),
squot32(D_68526 -
srem32(global_tid_73254,
group_sizze_77793 *
squot32(num_groups_77803 +
smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1, smax32(1,
N_68508 *
K_68510 *
triD_68516))) +
thread_per_segment_82960 - 1,
thread_per_segment_82960));
double x_77809;
double x_77810;
x_77809 = 0.0;
for (int32_t i_82973 = 0; i_82973 < chunk_sizze_82969; i_82973++) {
gtid_73253 = srem32(global_tid_73254, group_sizze_77793 *
squot32(num_groups_77803 + smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 * K_68510 *
triD_68516))) +
thread_per_segment_82960 * i_82973;
// apply map function
{
double x_77815 = *(__global double *) &mem_81324[(gtid_73228 *
(D_68526 *
triD_68516 *
K_68510) +
gtid_73229 *
(D_68526 *
triD_68516) +
gtid_73230 *
D_68526 +
gtid_73253) * 8];
// save results to be reduced
{
x_77810 = x_77815;
}
// save map-out results
{ }
// apply reduction operator
{
double res_77811 = x_77809 + x_77810;
x_77809 = res_77811;
}
}
}
// to reduce current chunk, first store our result to memory
{
*(__local double *) &red_arr_mem_82965[local_tid_73255 * 8] = x_77809;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t offset_82974;
int32_t skip_waves_82975;
double x_82970;
double x_82971;
offset_82974 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_73255, group_sizze_77793)) {
x_82970 = *(__local double *) &red_arr_mem_82965[(local_tid_73255 +
offset_82974) *
8];
}
}
offset_82974 = 1;
while (slt32(offset_82974, wave_sizze_82956)) {
if (slt32(local_tid_73255 + offset_82974, group_sizze_77793) &&
((local_tid_73255 - squot32(local_tid_73255, wave_sizze_82956) *
wave_sizze_82956) & (2 * offset_82974 - 1)) == 0) {
// read array element
{
x_82971 = *(volatile __local
double *) &red_arr_mem_82965[(local_tid_73255 +
offset_82974) * 8];
}
// apply reduction operation
{
double res_82972 = x_82970 + x_82971;
x_82970 = res_82972;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82965[local_tid_73255 * 8] = x_82970;
}
}
offset_82974 *= 2;
}
skip_waves_82975 = 1;
while (slt32(skip_waves_82975, squot32(group_sizze_77793 +
wave_sizze_82956 - 1,
wave_sizze_82956))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82974 = skip_waves_82975 * wave_sizze_82956;
if (slt32(local_tid_73255 + offset_82974, group_sizze_77793) &&
((local_tid_73255 - squot32(local_tid_73255, wave_sizze_82956) *
wave_sizze_82956) == 0 && (squot32(local_tid_73255,
wave_sizze_82956) & (2 *
skip_waves_82975 -
1)) ==
0)) {
// read array element
{
x_82971 = *(__local
double *) &red_arr_mem_82965[(local_tid_73255 +
offset_82974) * 8];
}
// apply reduction operation
{
double res_82972 = x_82970 + x_82971;
x_82970 = res_82972;
}
// write result of operation
{
*(__local double *) &red_arr_mem_82965[local_tid_73255 * 8] =
x_82970;
}
}
skip_waves_82975 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (squot32(num_groups_77803 + smax32(1, N_68508 * K_68510 * triD_68516) -
1, smax32(1, N_68508 * K_68510 * triD_68516)) == 1) {
// first thread in group saves final result to memory
{
if (local_tid_73255 == 0) {
*(__global double *) &mem_81329[(gtid_73228 * (triD_68516 *
K_68510) +
gtid_73229 * triD_68516 +
gtid_73230) * 8] = x_82970;
}
}
} else {
int32_t old_counter_82976;
// first thread in group saves group result to global memory
{
if (local_tid_73255 == 0) {
*(__global double *) &group_res_arr_mem_82961[group_id_73256 *
8] = x_82970;
mem_fence_global();
old_counter_82976 = atomic_add((volatile __global int *) &
counter_mem_82963[srem32(squot32(group_id_73256,
squot32(num_groups_77803 +
smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
triD_68516))),
1024) *
4], 1);
*(__local bool *) &sync_arr_mem_82967[0] = old_counter_82976 ==
squot32(num_groups_77803 + smax32(1, N_68508 * K_68510 *
triD_68516) - 1, smax32(1,
N_68508 *
K_68510 *
triD_68516)) -
1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_group_82977 = *(__local bool *) &sync_arr_mem_82967[0];
if (is_last_group_82977) {
if (local_tid_73255 == 0) {
old_counter_82976 = atomic_add((volatile __global int *) &
counter_mem_82963[srem32(squot32(group_id_73256,
squot32(num_groups_77803 +
smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
triD_68516))),
1024) *
4], 0 -
squot32(num_groups_77803 +
smax32(1, N_68508 *
K_68510 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
triD_68516)));
}
// read in the per-group-results
{
if (slt32(local_tid_73255, squot32(num_groups_77803 + smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 *
triD_68516)))) {
x_77809 = *(__global
double *) &group_res_arr_mem_82961[(squot32(group_id_73256,
squot32(num_groups_77803 +
smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
triD_68516))) *
squot32(num_groups_77803 +
smax32(1,
N_68508 *
K_68510 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
triD_68516)) +
local_tid_73255) *
8];
} else {
x_77809 = 0.0;
}
*(__local double *) &red_arr_mem_82965[local_tid_73255 * 8] =
x_77809;
}
barrier(CLK_LOCAL_MEM_FENCE);
// reduce the per-group results
{
int32_t offset_82978;
int32_t skip_waves_82979;
double x_82970;
double x_82971;
offset_82978 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_73255, group_sizze_77793)) {
x_82970 = *(__local
double *) &red_arr_mem_82965[(local_tid_73255 +
offset_82978) *
8];
}
}
offset_82978 = 1;
while (slt32(offset_82978, wave_sizze_82956)) {
if (slt32(local_tid_73255 + offset_82978,
group_sizze_77793) && ((local_tid_73255 -
squot32(local_tid_73255,
wave_sizze_82956) *
wave_sizze_82956) & (2 *
offset_82978 -
1)) ==
0) {
// read array element
{
x_82971 = *(volatile __local
double *) &red_arr_mem_82965[(local_tid_73255 +
offset_82978) *
8];
}
// apply reduction operation
{
double res_82972 = x_82970 + x_82971;
x_82970 = res_82972;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82965[local_tid_73255 *
8] = x_82970;
}
}
offset_82978 *= 2;
}
skip_waves_82979 = 1;
while (slt32(skip_waves_82979, squot32(group_sizze_77793 +
wave_sizze_82956 - 1,
wave_sizze_82956))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82978 = skip_waves_82979 * wave_sizze_82956;
if (slt32(local_tid_73255 + offset_82978,
group_sizze_77793) && ((local_tid_73255 -
squot32(local_tid_73255,
wave_sizze_82956) *
wave_sizze_82956) == 0 &&
(squot32(local_tid_73255,
wave_sizze_82956) &
(2 * skip_waves_82979 -
1)) == 0)) {
// read array element
{
x_82971 = *(__local
double *) &red_arr_mem_82965[(local_tid_73255 +
offset_82978) *
8];
}
// apply reduction operation
{
double res_82972 = x_82970 + x_82971;
x_82970 = res_82972;
}
// write result of operation
{
*(__local
double *) &red_arr_mem_82965[local_tid_73255 *
8] = x_82970;
}
}
skip_waves_82979 *= 2;
}
// and back to memory with the final result
{
if (local_tid_73255 == 0) {
*(__global double *) &mem_81329[(gtid_73228 *
(triD_68516 *
K_68510) +
gtid_73229 *
triD_68516 +
gtid_73230) * 8] =
x_82970;
}
}
}
}
}
}
__kernel void segred_large_73751(int32_t N_68508, int32_t K_68510,
int32_t triD_68516, int32_t D_68526,
int32_t num_groups_77705, __global
unsigned char *mem_81263, __global
unsigned char *mem_81269,
int32_t thread_per_segment_82906, __global
unsigned char *group_res_arr_mem_82907,
__global unsigned char *counter_mem_82909)
{
const int32_t group_sizze_77695 = rev_gmm_objectivezigroup_sizze_73733;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
ALIGNED_LOCAL_MEMORY(red_arr_mem_82911_backing_0, 8 *
rev_gmm_objectivezigroup_sizze_73733);
ALIGNED_LOCAL_MEMORY(sync_arr_mem_82913_backing_1, 1);
int32_t global_tid_73751;
int32_t local_tid_73752;
int32_t group_sizze_82903;
int32_t wave_sizze_82902;
int32_t group_id_73753;
global_tid_73751 = get_global_id(0);
local_tid_73752 = get_local_id(0);
group_sizze_82903 = get_local_size(0);
wave_sizze_82902 = LOCKSTEP_WIDTH;
group_id_73753 = get_group_id(0);
int32_t gtid_73723;
int32_t gtid_73724;
int32_t gtid_73725;
int32_t gtid_73726;
int32_t gtid_73750;
__local char *red_arr_mem_82911;
red_arr_mem_82911 = (__local char *) red_arr_mem_82911_backing_0;
__local char *sync_arr_mem_82913;
sync_arr_mem_82913 = (__local char *) sync_arr_mem_82913_backing_1;
gtid_73723 = squot32(squot32(group_id_73753, squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516);
gtid_73724 = squot32(squot32(group_id_73753, squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753,
squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 * D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) * (K_68510 *
D_68526 *
triD_68516),
D_68526 * triD_68516);
gtid_73725 = squot32(squot32(group_id_73753, squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753,
squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 * D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) * (K_68510 *
D_68526 *
triD_68516) -
squot32(squot32(group_id_73753,
squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 * D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753,
squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) *
(K_68510 * D_68526 * triD_68516), D_68526 *
triD_68516) * (D_68526 * triD_68516),
triD_68516);
gtid_73726 = squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 * D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 * K_68510 *
D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) * (K_68510 * D_68526 *
triD_68516) -
squot32(squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 * K_68510 *
D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753, squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) * (K_68510 * D_68526 *
triD_68516),
D_68526 * triD_68516) * (D_68526 * triD_68516) -
squot32(squot32(group_id_73753, squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 * K_68510 *
D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753, squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) * (K_68510 * D_68526 *
triD_68516) -
squot32(squot32(group_id_73753, squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))) -
squot32(squot32(group_id_73753,
squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 * K_68510 *
D_68526 *
triD_68516))),
K_68510 * D_68526 * triD_68516) * (K_68510 *
D_68526 *
triD_68516),
D_68526 * triD_68516) * (D_68526 * triD_68516),
triD_68516) * triD_68516;
int32_t chunk_sizze_82915 = smin32(squot32(D_68526 + group_sizze_77695 *
squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516)) - 1,
group_sizze_77695 *
squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516))),
squot32(D_68526 -
srem32(global_tid_73751,
group_sizze_77695 *
squot32(num_groups_77705 +
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516))) +
thread_per_segment_82906 - 1,
thread_per_segment_82906));
double x_77711;
double x_77712;
x_77711 = 0.0;
for (int32_t i_82919 = 0; i_82919 < chunk_sizze_82915; i_82919++) {
gtid_73750 = srem32(global_tid_73751, group_sizze_77695 *
squot32(num_groups_77705 + smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 * K_68510 * D_68526 *
triD_68516))) +
thread_per_segment_82906 * i_82919;
// apply map function
{
double x_77718 = *(__global double *) &mem_81263[(gtid_73723 *
(D_68526 *
triD_68516 *
D_68526 *
K_68510) +
gtid_73724 *
(D_68526 *
triD_68516 *
D_68526) +
gtid_73725 *
(D_68526 *
triD_68516) +
gtid_73726 *
D_68526 +
gtid_73750) * 8];
// save results to be reduced
{
x_77712 = x_77718;
}
// save map-out results
{ }
// apply reduction operator
{
double res_77713 = x_77711 + x_77712;
x_77711 = res_77713;
}
}
}
// to reduce current chunk, first store our result to memory
{
*(__local double *) &red_arr_mem_82911[local_tid_73752 * 8] = x_77711;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t offset_82920;
int32_t skip_waves_82921;
double x_82916;
double x_82917;
offset_82920 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_73752, group_sizze_77695)) {
x_82916 = *(__local double *) &red_arr_mem_82911[(local_tid_73752 +
offset_82920) *
8];
}
}
offset_82920 = 1;
while (slt32(offset_82920, wave_sizze_82902)) {
if (slt32(local_tid_73752 + offset_82920, group_sizze_77695) &&
((local_tid_73752 - squot32(local_tid_73752, wave_sizze_82902) *
wave_sizze_82902) & (2 * offset_82920 - 1)) == 0) {
// read array element
{
x_82917 = *(volatile __local
double *) &red_arr_mem_82911[(local_tid_73752 +
offset_82920) * 8];
}
// apply reduction operation
{
double res_82918 = x_82916 + x_82917;
x_82916 = res_82918;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82911[local_tid_73752 * 8] = x_82916;
}
}
offset_82920 *= 2;
}
skip_waves_82921 = 1;
while (slt32(skip_waves_82921, squot32(group_sizze_77695 +
wave_sizze_82902 - 1,
wave_sizze_82902))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82920 = skip_waves_82921 * wave_sizze_82902;
if (slt32(local_tid_73752 + offset_82920, group_sizze_77695) &&
((local_tid_73752 - squot32(local_tid_73752, wave_sizze_82902) *
wave_sizze_82902) == 0 && (squot32(local_tid_73752,
wave_sizze_82902) & (2 *
skip_waves_82921 -
1)) ==
0)) {
// read array element
{
x_82917 = *(__local
double *) &red_arr_mem_82911[(local_tid_73752 +
offset_82920) * 8];
}
// apply reduction operation
{
double res_82918 = x_82916 + x_82917;
x_82916 = res_82918;
}
// write result of operation
{
*(__local double *) &red_arr_mem_82911[local_tid_73752 * 8] =
x_82916;
}
}
skip_waves_82921 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (squot32(num_groups_77705 + smax32(1, N_68508 * K_68510 * D_68526 *
triD_68516) - 1, smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516)) ==
1) {
// first thread in group saves final result to memory
{
if (local_tid_73752 == 0) {
*(__global double *) &mem_81269[(gtid_73723 * (triD_68516 *
D_68526 *
K_68510) +
gtid_73724 * (triD_68516 *
D_68526) +
gtid_73725 * triD_68516 +
gtid_73726) * 8] = x_82916;
}
}
} else {
int32_t old_counter_82922;
// first thread in group saves group result to global memory
{
if (local_tid_73752 == 0) {
*(__global double *) &group_res_arr_mem_82907[group_id_73753 *
8] = x_82916;
mem_fence_global();
old_counter_82922 = atomic_add((volatile __global int *) &
counter_mem_82909[srem32(squot32(group_id_73753,
squot32(num_groups_77705 +
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516))),
1024) *
4], 1);
*(__local bool *) &sync_arr_mem_82913[0] = old_counter_82922 ==
squot32(num_groups_77705 + smax32(1, N_68508 * K_68510 *
D_68526 * triD_68516) - 1,
smax32(1, N_68508 * K_68510 * D_68526 *
triD_68516)) - 1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_group_82923 = *(__local bool *) &sync_arr_mem_82913[0];
if (is_last_group_82923) {
if (local_tid_73752 == 0) {
old_counter_82922 = atomic_add((volatile __global int *) &
counter_mem_82909[srem32(squot32(group_id_73753,
squot32(num_groups_77705 +
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516))),
1024) *
4], 0 -
squot32(num_groups_77705 +
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526 *
triD_68516)));
}
// read in the per-group-results
{
if (slt32(local_tid_73752, squot32(num_groups_77705 + smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1, smax32(1, N_68508 *
K_68510 * D_68526 *
triD_68516)))) {
x_77711 = *(__global
double *) &group_res_arr_mem_82907[(squot32(group_id_73753,
squot32(num_groups_77705 +
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516))) *
squot32(num_groups_77705 +
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526 *
triD_68516)) +
local_tid_73752) *
8];
} else {
x_77711 = 0.0;
}
*(__local double *) &red_arr_mem_82911[local_tid_73752 * 8] =
x_77711;
}
barrier(CLK_LOCAL_MEM_FENCE);
// reduce the per-group results
{
int32_t offset_82924;
int32_t skip_waves_82925;
double x_82916;
double x_82917;
offset_82924 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_73752, group_sizze_77695)) {
x_82916 = *(__local
double *) &red_arr_mem_82911[(local_tid_73752 +
offset_82924) *
8];
}
}
offset_82924 = 1;
while (slt32(offset_82924, wave_sizze_82902)) {
if (slt32(local_tid_73752 + offset_82924,
group_sizze_77695) && ((local_tid_73752 -
squot32(local_tid_73752,
wave_sizze_82902) *
wave_sizze_82902) & (2 *
offset_82924 -
1)) ==
0) {
// read array element
{
x_82917 = *(volatile __local
double *) &red_arr_mem_82911[(local_tid_73752 +
offset_82924) *
8];
}
// apply reduction operation
{
double res_82918 = x_82916 + x_82917;
x_82916 = res_82918;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82911[local_tid_73752 *
8] = x_82916;
}
}
offset_82924 *= 2;
}
skip_waves_82925 = 1;
while (slt32(skip_waves_82925, squot32(group_sizze_77695 +
wave_sizze_82902 - 1,
wave_sizze_82902))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82924 = skip_waves_82925 * wave_sizze_82902;
if (slt32(local_tid_73752 + offset_82924,
group_sizze_77695) && ((local_tid_73752 -
squot32(local_tid_73752,
wave_sizze_82902) *
wave_sizze_82902) == 0 &&
(squot32(local_tid_73752,
wave_sizze_82902) &
(2 * skip_waves_82925 -
1)) == 0)) {
// read array element
{
x_82917 = *(__local
double *) &red_arr_mem_82911[(local_tid_73752 +
offset_82924) *
8];
}
// apply reduction operation
{
double res_82918 = x_82916 + x_82917;
x_82916 = res_82918;
}
// write result of operation
{
*(__local
double *) &red_arr_mem_82911[local_tid_73752 *
8] = x_82916;
}
}
skip_waves_82925 *= 2;
}
// and back to memory with the final result
{
if (local_tid_73752 == 0) {
*(__global double *) &mem_81269[(gtid_73723 *
(triD_68516 * D_68526 *
K_68510) +
gtid_73724 *
(triD_68516 *
D_68526) +
gtid_73725 *
triD_68516 +
gtid_73726) * 8] =
x_82916;
}
}
}
}
}
}
// Segmented sum reduction kernel.
//
// Every work-group is assigned to one segment; a segment is identified by a
// 4-component index (i0, i1, i2, i3) decoded from the group id.  Each thread
// sums a strided subset of the D_68526 doubles of its segment read from
// mem_81194, the work-group then combines the per-thread sums through local
// memory, and the final sum is stored as a double in mem_81200.
//
// When more than one work-group cooperates on a segment, each group stores
// its partial sum in group_res_arr_mem_82853 and bumps a counter in
// counter_mem_82855; the group that observes the last increment re-reduces
// the per-group partial sums and writes the final value.
//
// All buffers are byte-addressed; elements are doubles (8 bytes) except the
// counters, which are 4-byte ints.
// NOTE(review): squot32/srem32/smax32/smin32/slt32 are defined elsewhere in
// this file; presumably signed quotient/remainder/max/min/less-than.
__kernel void segred_large_73924(int32_t N_68508, int32_t K_68510,
                                 int32_t D_68526, int32_t num_groups_77606,
                                 __global unsigned char *mem_81194,
                                 __global unsigned char *mem_81200,
                                 int32_t thread_per_segment_82852,
                                 __global unsigned char *group_res_arr_mem_82853,
                                 __global unsigned char *counter_mem_82855)
{
    const int32_t group_size = rev_gmm_objectivezigroup_sizze_73906;

    ALIGNED_LOCAL_MEMORY(red_arr_mem_82857_backing_0, 8 *
                         rev_gmm_objectivezigroup_sizze_73906);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_82859_backing_1, 1);

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    const int32_t group_id = get_group_id(0);

    __local char *red_arr = (__local char *) red_arr_mem_82857_backing_0;
    __local char *sync_arr = (__local char *) sync_arr_mem_82859_backing_1;

    // Number of segments (at least 1) and how many groups share one segment.
    const int32_t num_segments = smax32(1, N_68508 * K_68510 * D_68526 *
                                        D_68526);
    const int32_t groups_per_segment = squot32(num_groups_77606 +
                                               num_segments - 1, num_segments);

    // Flat segment index of this group, then its 4-component decomposition.
    const int32_t segment = squot32(group_id, groups_per_segment);
    const int32_t kdd = K_68510 * D_68526 * D_68526;
    const int32_t dd = D_68526 * D_68526;

    const int32_t i0 = squot32(segment, kdd);
    const int32_t rem0 = segment - i0 * kdd;
    const int32_t i1 = squot32(rem0, dd);
    const int32_t rem1 = rem0 - i1 * dd;
    const int32_t i2 = squot32(rem1, D_68526);
    const int32_t i3 = rem1 - i2 * D_68526;

    // Position of this thread among all threads of the groups that share the
    // segment, and the number of elements it has to visit.
    const int32_t seg_threads = group_size * groups_per_segment;
    const int32_t tid_in_segment = srem32(global_tid, seg_threads);
    const int32_t chunk_size =
        smin32(squot32(D_68526 + seg_threads - 1, seg_threads),
               squot32(D_68526 - tid_in_segment + thread_per_segment_82852 - 1,
                       thread_per_segment_82852));

    // Element offset (in doubles) of the first element of this segment.
    const int32_t row_base = i0 * (D_68526 * D_68526 * D_68526 * K_68510) +
                             i1 * (D_68526 * D_68526 * D_68526) +
                             i2 * (D_68526 * D_68526) +
                             i3 * D_68526;
    // Element offset (in doubles) of this segment's result.
    const int32_t out_index = i0 * (D_68526 * D_68526 * K_68510) +
                              i1 * (D_68526 * D_68526) +
                              i2 * D_68526 +
                              i3;

    // Sequential per-thread sum over a strided subset of the segment.
    double partial = 0.0;
    for (int32_t i = 0; i < chunk_size; i++) {
        const int32_t elem = tid_in_segment + thread_per_segment_82852 * i;
        const double value = *(__global double *) &mem_81194[(row_base +
                                                              elem) * 8];

        partial = partial + value;
    }

    // Publish the per-thread sum so the group can combine them.
    *(__local double *) &red_arr[local_tid * 8] = partial;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Lane within the wave and index of the wave within the group.
    const int32_t wave_idx = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave_idx * wave_size;
    const int32_t num_waves = squot32(group_size + wave_size - 1, wave_size);

    double acc;
    double incoming;

    // participating threads read initial accumulator
    if (slt32(local_tid, group_size)) {
        acc = *(__local double *) &red_arr[local_tid * 8];
    }

    // Tree reduction inside each wave; no barrier is used here, only
    // volatile local-memory accesses.
    for (int32_t stride = 1; slt32(stride, wave_size); stride *= 2) {
        if (slt32(local_tid + stride, group_size) &&
            (lane & (2 * stride - 1)) == 0) {
            incoming = *(volatile __local double *) &red_arr[(local_tid +
                                                               stride) * 8];
            acc = acc + incoming;
            *(volatile __local double *) &red_arr[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves; only lane 0 of each wave takes part and
    // every step is preceded by a barrier.
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t stride = skip * wave_size;

        if (slt32(local_tid + stride, group_size) &&
            (lane == 0 && (wave_idx & (2 * skip - 1)) == 0)) {
            incoming = *(__local double *) &red_arr[(local_tid + stride) * 8];
            acc = acc + incoming;
            *(__local double *) &red_arr[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (groups_per_segment == 1) {
        // A single group owns the segment: its first thread stores the
        // final result directly.
        if (local_tid == 0) {
            *(__global double *) &mem_81200[out_index * 8] = acc;
        }
    } else {
        // Counter slot shared by the groups of this segment (slot index is
        // the segment index modulo 1024).
        volatile __global int *counter =
            (volatile __global int *) &counter_mem_82855[srem32(segment,
                                                                1024) * 4];

        // First thread stores the group result, then announces completion;
        // the group seeing the last increment is flagged through sync_arr.
        if (local_tid == 0) {
            *(__global double *) &group_res_arr_mem_82853[group_id * 8] = acc;
            mem_fence_global();

            const int32_t old_counter = atomic_add(counter, 1);

            *(__local bool *) &sync_arr[0] = old_counter ==
                groups_per_segment - 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        const bool is_last_group = *(__local bool *) &sync_arr[0];

        if (is_last_group) {
            // Undo this segment's increments so the counter returns to its
            // previous value.
            if (local_tid == 0) {
                atomic_add(counter, 0 - groups_per_segment);
            }

            // read in the per-group-results (threads beyond the number of
            // groups contribute the neutral element 0.0)
            partial = slt32(local_tid, groups_per_segment)
                ? *(__global double *) &group_res_arr_mem_82853[(segment *
                                                                 groups_per_segment +
                                                                 local_tid) * 8]
                : 0.0;
            *(__local double *) &red_arr[local_tid * 8] = partial;
            barrier(CLK_LOCAL_MEM_FENCE);

            // reduce the per-group results, same scheme as above
            double total;
            double other;

            if (slt32(local_tid, group_size)) {
                total = *(__local double *) &red_arr[local_tid * 8];
            }

            for (int32_t stride = 1; slt32(stride, wave_size); stride *= 2) {
                if (slt32(local_tid + stride, group_size) &&
                    (lane & (2 * stride - 1)) == 0) {
                    other = *(volatile __local
                              double *) &red_arr[(local_tid + stride) * 8];
                    total = total + other;
                    *(volatile __local double *) &red_arr[local_tid * 8] =
                        total;
                }
            }

            for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
                barrier(CLK_LOCAL_MEM_FENCE);

                const int32_t stride = skip * wave_size;

                if (slt32(local_tid + stride, group_size) &&
                    (lane == 0 && (wave_idx & (2 * skip - 1)) == 0)) {
                    other = *(__local double *) &red_arr[(local_tid +
                                                          stride) * 8];
                    total = total + other;
                    *(__local double *) &red_arr[local_tid * 8] = total;
                }
            }

            // and back to memory with the final result
            if (local_tid == 0) {
                *(__global double *) &mem_81200[out_index * 8] = total;
            }
        }
    }
}
// Segmented sum-of-products reduction kernel.
//
// Every work-group is assigned to one segment; a segment is identified by a
// 3-component index (i0, i1, i2) decoded from the group id.  For each element
// j it visits, a thread multiplies a double read from mem_81082 (indexed by
// i0, i1, i2, j) by a double read from res_r_r_mem_80930 (indexed by
// i0, i1, j) and accumulates the product.  The work-group combines the
// per-thread sums through local memory and the final sum is stored as a
// double in mem_81087.
//
// When more than one work-group cooperates on a segment, each group stores
// its partial sum in group_res_arr_mem_82776 and bumps a counter in
// counter_mem_82778; the group that observes the last increment re-reduces
// the per-group partial sums and writes the final value.
//
// All buffers are byte-addressed; elements are doubles (8 bytes) except the
// counters, which are 4-byte ints.
// NOTE(review): squot32/srem32/smax32/smin32/slt32 are defined elsewhere in
// this file; presumably signed quotient/remainder/max/min/less-than.
__kernel void segred_large_74280(int32_t N_68508, int32_t K_68510,
                                 int32_t D_68526, int32_t num_groups_77406,
                                 __global unsigned char *res_r_r_mem_80930,
                                 __global unsigned char *mem_81082,
                                 __global unsigned char *mem_81087,
                                 int32_t thread_per_segment_82775,
                                 __global unsigned char *group_res_arr_mem_82776,
                                 __global unsigned char *counter_mem_82778)
{
    const int32_t group_size = rev_gmm_objectivezigroup_sizze_74262;

    ALIGNED_LOCAL_MEMORY(red_arr_mem_82780_backing_0, 8 *
                         rev_gmm_objectivezigroup_sizze_74262);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_82782_backing_1, 1);

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    const int32_t group_id = get_group_id(0);

    __local char *red_arr = (__local char *) red_arr_mem_82780_backing_0;
    __local char *sync_arr = (__local char *) sync_arr_mem_82782_backing_1;

    // Number of segments (at least 1) and how many groups share one segment.
    const int32_t num_segments = smax32(1, N_68508 * K_68510 * D_68526);
    const int32_t groups_per_segment = squot32(num_groups_77406 +
                                               num_segments - 1, num_segments);

    // Flat segment index of this group, then its 3-component decomposition.
    const int32_t segment = squot32(group_id, groups_per_segment);
    const int32_t kd = K_68510 * D_68526;

    const int32_t i0 = squot32(segment, kd);
    const int32_t rem0 = segment - i0 * kd;
    const int32_t i1 = squot32(rem0, D_68526);
    const int32_t i2 = rem0 - i1 * D_68526;

    // Position of this thread among all threads of the groups that share the
    // segment, and the number of elements it has to visit.
    const int32_t seg_threads = group_size * groups_per_segment;
    const int32_t tid_in_segment = srem32(global_tid, seg_threads);
    const int32_t chunk_size =
        smin32(squot32(D_68526 + seg_threads - 1, seg_threads),
               squot32(D_68526 - tid_in_segment + thread_per_segment_82775 - 1,
                       thread_per_segment_82775));

    // Element offsets (in doubles) of the two operand rows.
    const int32_t lhs_base = i0 * (D_68526 * D_68526 * K_68510) +
                             i1 * (D_68526 * D_68526) +
                             i2 * D_68526;
    const int32_t rhs_base = i0 * (D_68526 * K_68510) +
                             i1 * D_68526;
    // Element offset (in doubles) of this segment's result.
    const int32_t out_index = i0 * (D_68526 * K_68510) +
                              i1 * D_68526 +
                              i2;

    // Sequential per-thread sum of products over a strided subset.
    double partial = 0.0;
    for (int32_t i = 0; i < chunk_size; i++) {
        const int32_t elem = tid_in_segment + thread_per_segment_82775 * i;
        const double lhs = *(__global double *) &mem_81082[(lhs_base +
                                                            elem) * 8];
        const double rhs = *(__global double *) &res_r_r_mem_80930[(rhs_base +
                                                                    elem) * 8];
        const double product = lhs * rhs;

        partial = partial + product;
    }

    // Publish the per-thread sum so the group can combine them.
    *(__local double *) &red_arr[local_tid * 8] = partial;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Lane within the wave and index of the wave within the group.
    const int32_t wave_idx = squot32(local_tid, wave_size);
    const int32_t lane = local_tid - wave_idx * wave_size;
    const int32_t num_waves = squot32(group_size + wave_size - 1, wave_size);

    double acc;
    double incoming;

    // participating threads read initial accumulator
    if (slt32(local_tid, group_size)) {
        acc = *(__local double *) &red_arr[local_tid * 8];
    }

    // Tree reduction inside each wave; no barrier is used here, only
    // volatile local-memory accesses.
    for (int32_t stride = 1; slt32(stride, wave_size); stride *= 2) {
        if (slt32(local_tid + stride, group_size) &&
            (lane & (2 * stride - 1)) == 0) {
            incoming = *(volatile __local double *) &red_arr[(local_tid +
                                                               stride) * 8];
            acc = acc + incoming;
            *(volatile __local double *) &red_arr[local_tid * 8] = acc;
        }
    }

    // Tree reduction across waves; only lane 0 of each wave takes part and
    // every step is preceded by a barrier.
    for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        const int32_t stride = skip * wave_size;

        if (slt32(local_tid + stride, group_size) &&
            (lane == 0 && (wave_idx & (2 * skip - 1)) == 0)) {
            incoming = *(__local double *) &red_arr[(local_tid + stride) * 8];
            acc = acc + incoming;
            *(__local double *) &red_arr[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (groups_per_segment == 1) {
        // A single group owns the segment: its first thread stores the
        // final result directly.
        if (local_tid == 0) {
            *(__global double *) &mem_81087[out_index * 8] = acc;
        }
    } else {
        // Counter slot shared by the groups of this segment (slot index is
        // the segment index modulo 1024).
        volatile __global int *counter =
            (volatile __global int *) &counter_mem_82778[srem32(segment,
                                                                1024) * 4];

        // First thread stores the group result, then announces completion;
        // the group seeing the last increment is flagged through sync_arr.
        if (local_tid == 0) {
            *(__global double *) &group_res_arr_mem_82776[group_id * 8] = acc;
            mem_fence_global();

            const int32_t old_counter = atomic_add(counter, 1);

            *(__local bool *) &sync_arr[0] = old_counter ==
                groups_per_segment - 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        const bool is_last_group = *(__local bool *) &sync_arr[0];

        if (is_last_group) {
            // Undo this segment's increments so the counter returns to its
            // previous value.
            if (local_tid == 0) {
                atomic_add(counter, 0 - groups_per_segment);
            }

            // read in the per-group-results (threads beyond the number of
            // groups contribute the neutral element 0.0)
            partial = slt32(local_tid, groups_per_segment)
                ? *(__global double *) &group_res_arr_mem_82776[(segment *
                                                                 groups_per_segment +
                                                                 local_tid) * 8]
                : 0.0;
            *(__local double *) &red_arr[local_tid * 8] = partial;
            barrier(CLK_LOCAL_MEM_FENCE);

            // reduce the per-group results, same scheme as above
            double total;
            double other;

            if (slt32(local_tid, group_size)) {
                total = *(__local double *) &red_arr[local_tid * 8];
            }

            for (int32_t stride = 1; slt32(stride, wave_size); stride *= 2) {
                if (slt32(local_tid + stride, group_size) &&
                    (lane & (2 * stride - 1)) == 0) {
                    other = *(volatile __local
                              double *) &red_arr[(local_tid + stride) * 8];
                    total = total + other;
                    *(volatile __local double *) &red_arr[local_tid * 8] =
                        total;
                }
            }

            for (int32_t skip = 1; slt32(skip, num_waves); skip *= 2) {
                barrier(CLK_LOCAL_MEM_FENCE);

                const int32_t stride = skip * wave_size;

                if (slt32(local_tid + stride, group_size) &&
                    (lane == 0 && (wave_idx & (2 * skip - 1)) == 0)) {
                    other = *(__local double *) &red_arr[(local_tid +
                                                          stride) * 8];
                    total = total + other;
                    *(__local double *) &red_arr[local_tid * 8] = total;
                }
            }

            // and back to memory with the final result
            if (local_tid == 0) {
                *(__global double *) &mem_81087[out_index * 8] = total;
            }
        }
    }
}
__kernel void segred_large_74786(int32_t N_68508, int32_t K_68510,
int32_t D_68526, int32_t num_groups_77093,
__global unsigned char *mem_80879, __global
unsigned char *mem_80889, __global
unsigned char *mem_80919,
int32_t thread_per_segment_82664, __global
unsigned char *group_res_arr_mem_82665,
__global unsigned char *counter_mem_82667)
{
const int32_t group_sizze_77083 = rev_gmm_objectivezigroup_sizze_74768;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
ALIGNED_LOCAL_MEMORY(red_arr_mem_82669_backing_0, 8 *
rev_gmm_objectivezigroup_sizze_74768);
ALIGNED_LOCAL_MEMORY(sync_arr_mem_82671_backing_1, 1);
int32_t global_tid_74786;
int32_t local_tid_74787;
int32_t group_sizze_82661;
int32_t wave_sizze_82660;
int32_t group_id_74788;
global_tid_74786 = get_global_id(0);
local_tid_74787 = get_local_id(0);
group_sizze_82661 = get_local_size(0);
wave_sizze_82660 = LOCKSTEP_WIDTH;
group_id_74788 = get_group_id(0);
int32_t gtid_74758;
int32_t gtid_74759;
int32_t gtid_74760;
int32_t gtid_74785;
__local char *red_arr_mem_82669;
red_arr_mem_82669 = (__local char *) red_arr_mem_82669_backing_0;
__local char *sync_arr_mem_82671;
sync_arr_mem_82671 = (__local char *) sync_arr_mem_82671_backing_1;
gtid_74758 = squot32(squot32(group_id_74788, squot32(num_groups_77093 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526))),
K_68510 * D_68526);
gtid_74759 = squot32(squot32(group_id_74788, squot32(num_groups_77093 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526))) -
squot32(squot32(group_id_74788,
squot32(num_groups_77093 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 *
K_68510 * D_68526))),
K_68510 * D_68526) * (K_68510 * D_68526),
D_68526);
gtid_74760 = squot32(group_id_74788, squot32(num_groups_77093 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 *
K_68510 *
D_68526))) -
squot32(squot32(group_id_74788, squot32(num_groups_77093 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 * K_68510 *
D_68526))), K_68510 *
D_68526) * (K_68510 * D_68526) - squot32(squot32(group_id_74788,
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))) -
squot32(squot32(group_id_74788,
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))),
K_68510 *
D_68526) *
(K_68510 * D_68526),
D_68526) * D_68526;
int32_t chunk_sizze_82673 = smin32(squot32(D_68526 + group_sizze_77083 *
squot32(num_groups_77093 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526)) - 1,
group_sizze_77083 *
squot32(num_groups_77093 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526))),
squot32(D_68526 -
srem32(global_tid_74786,
group_sizze_77083 *
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1,
N_68508 *
K_68510 *
D_68526))) +
thread_per_segment_82664 - 1,
thread_per_segment_82664));
double x_77099;
double x_77100;
x_77099 = 0.0;
for (int32_t i_82677 = 0; i_82677 < chunk_sizze_82673; i_82677++) {
gtid_74785 = srem32(global_tid_74786, group_sizze_77083 *
squot32(num_groups_77093 + smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 * K_68510 * D_68526))) +
thread_per_segment_82664 * i_82677;
// apply map function
{
double x_77109;
double x_77110;
double res_77111;
x_77109 = *(__global double *) &mem_80889[(gtid_74758 * (D_68526 *
K_68510) +
gtid_74759 * D_68526 +
gtid_74785) * 8];
x_77110 = *(__global double *) &mem_80879[(gtid_74758 * (D_68526 *
D_68526 *
K_68510) +
gtid_74759 * (D_68526 *
D_68526) +
gtid_74760 * D_68526 +
gtid_74785) * 8];
res_77111 = x_77109 * x_77110;
// save results to be reduced
{
x_77100 = res_77111;
}
// save map-out results
{ }
// apply reduction operator
{
double res_77101 = x_77099 + x_77100;
x_77099 = res_77101;
}
}
}
// to reduce current chunk, first store our result to memory
{
*(__local double *) &red_arr_mem_82669[local_tid_74787 * 8] = x_77099;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t offset_82678;
int32_t skip_waves_82679;
double x_82674;
double x_82675;
offset_82678 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_74787, group_sizze_77083)) {
x_82674 = *(__local double *) &red_arr_mem_82669[(local_tid_74787 +
offset_82678) *
8];
}
}
offset_82678 = 1;
while (slt32(offset_82678, wave_sizze_82660)) {
if (slt32(local_tid_74787 + offset_82678, group_sizze_77083) &&
((local_tid_74787 - squot32(local_tid_74787, wave_sizze_82660) *
wave_sizze_82660) & (2 * offset_82678 - 1)) == 0) {
// read array element
{
x_82675 = *(volatile __local
double *) &red_arr_mem_82669[(local_tid_74787 +
offset_82678) * 8];
}
// apply reduction operation
{
double res_82676 = x_82674 + x_82675;
x_82674 = res_82676;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82669[local_tid_74787 * 8] = x_82674;
}
}
offset_82678 *= 2;
}
skip_waves_82679 = 1;
while (slt32(skip_waves_82679, squot32(group_sizze_77083 +
wave_sizze_82660 - 1,
wave_sizze_82660))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82678 = skip_waves_82679 * wave_sizze_82660;
if (slt32(local_tid_74787 + offset_82678, group_sizze_77083) &&
((local_tid_74787 - squot32(local_tid_74787, wave_sizze_82660) *
wave_sizze_82660) == 0 && (squot32(local_tid_74787,
wave_sizze_82660) & (2 *
skip_waves_82679 -
1)) ==
0)) {
// read array element
{
x_82675 = *(__local
double *) &red_arr_mem_82669[(local_tid_74787 +
offset_82678) * 8];
}
// apply reduction operation
{
double res_82676 = x_82674 + x_82675;
x_82674 = res_82676;
}
// write result of operation
{
*(__local double *) &red_arr_mem_82669[local_tid_74787 * 8] =
x_82674;
}
}
skip_waves_82679 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (squot32(num_groups_77093 + smax32(1, N_68508 * K_68510 * D_68526) - 1,
smax32(1, N_68508 * K_68510 * D_68526)) == 1) {
// first thread in group saves final result to memory
{
if (local_tid_74787 == 0) {
*(__global double *) &mem_80919[(gtid_74758 * (D_68526 *
K_68510) +
gtid_74759 * D_68526 +
gtid_74760) * 8] = x_82674;
}
}
} else {
int32_t old_counter_82680;
// first thread in group saves group result to global memory
{
if (local_tid_74787 == 0) {
*(__global double *) &group_res_arr_mem_82665[group_id_74788 *
8] = x_82674;
mem_fence_global();
old_counter_82680 = atomic_add((volatile __global int *) &
counter_mem_82667[srem32(squot32(group_id_74788,
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))),
1024) *
4], 1);
*(__local bool *) &sync_arr_mem_82671[0] = old_counter_82680 ==
squot32(num_groups_77093 + smax32(1, N_68508 * K_68510 *
D_68526) - 1, smax32(1,
N_68508 *
K_68510 *
D_68526)) -
1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_group_82681 = *(__local bool *) &sync_arr_mem_82671[0];
if (is_last_group_82681) {
if (local_tid_74787 == 0) {
old_counter_82680 = atomic_add((volatile __global int *) &
counter_mem_82667[srem32(squot32(group_id_74788,
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))),
1024) *
4], 0 -
squot32(num_groups_77093 +
smax32(1, N_68508 *
K_68510 *
D_68526) - 1,
smax32(1, N_68508 *
K_68510 *
D_68526)));
}
// read in the per-group-results
{
if (slt32(local_tid_74787, squot32(num_groups_77093 + smax32(1,
N_68508 *
K_68510 *
D_68526) -
1, smax32(1, N_68508 *
K_68510 *
D_68526)))) {
x_77099 = *(__global
double *) &group_res_arr_mem_82665[(squot32(group_id_74788,
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526))) *
squot32(num_groups_77093 +
smax32(1,
N_68508 *
K_68510 *
D_68526) -
1,
smax32(1,
N_68508 *
K_68510 *
D_68526)) +
local_tid_74787) *
8];
} else {
x_77099 = 0.0;
}
*(__local double *) &red_arr_mem_82669[local_tid_74787 * 8] =
x_77099;
}
barrier(CLK_LOCAL_MEM_FENCE);
// reduce the per-group results
{
int32_t offset_82682;
int32_t skip_waves_82683;
double x_82674;
double x_82675;
offset_82682 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_74787, group_sizze_77083)) {
x_82674 = *(__local
double *) &red_arr_mem_82669[(local_tid_74787 +
offset_82682) *
8];
}
}
offset_82682 = 1;
while (slt32(offset_82682, wave_sizze_82660)) {
if (slt32(local_tid_74787 + offset_82682,
group_sizze_77083) && ((local_tid_74787 -
squot32(local_tid_74787,
wave_sizze_82660) *
wave_sizze_82660) & (2 *
offset_82682 -
1)) ==
0) {
// read array element
{
x_82675 = *(volatile __local
double *) &red_arr_mem_82669[(local_tid_74787 +
offset_82682) *
8];
}
// apply reduction operation
{
double res_82676 = x_82674 + x_82675;
x_82674 = res_82676;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_82669[local_tid_74787 *
8] = x_82674;
}
}
offset_82682 *= 2;
}
skip_waves_82683 = 1;
while (slt32(skip_waves_82683, squot32(group_sizze_77083 +
wave_sizze_82660 - 1,
wave_sizze_82660))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_82682 = skip_waves_82683 * wave_sizze_82660;
if (slt32(local_tid_74787 + offset_82682,
group_sizze_77083) && ((local_tid_74787 -
squot32(local_tid_74787,
wave_sizze_82660) *
wave_sizze_82660) == 0 &&
(squot32(local_tid_74787,
wave_sizze_82660) &
(2 * skip_waves_82683 -
1)) == 0)) {
// read array element
{
x_82675 = *(__local
double *) &red_arr_mem_82669[(local_tid_74787 +
offset_82682) *
8];
}
// apply reduction operation
{
double res_82676 = x_82674 + x_82675;
x_82674 = res_82676;
}
// write result of operation
{
*(__local
double *) &red_arr_mem_82669[local_tid_74787 *
8] = x_82674;
}
}
skip_waves_82683 *= 2;
}
// and back to memory with the final result
{
if (local_tid_74787 == 0) {
*(__global double *) &mem_80919[(gtid_74758 * (D_68526 *
K_68510) +
gtid_74759 * D_68526 +
gtid_74760) * 8] =
x_82674;
}
}
}
}
}
}
// segred_large_76256: segmented sum reduction fused with its map stage.
//
// Work-groups are split into runs of `groups_per_segment` consecutive groups;
// the segment handled by a group is group_id / groups_per_segment.  Every
// work-item strides over indices k (step thread_per_segment_82502) and, for
// each k, evaluates
//
//   exponent = alphas[k] + sum_d mem_80631[d, k]
//              - 0.5 * sum_row ( sum_col centered[col] * coeff(row, col) )^2
//
// where centered[col] = x[segment, col] - mem_80635[col, k] is staged in the
// per-work-item slice of mem_80642, and coeff is 0 above the diagonal,
// exp(mem_80631[row, k]) on it and an element of mem_80639 below it.
// `exponent` is stored to mem_80649[segment, k]; exp(exponent) is what gets
// summed.  The per-segment sum is written to mem_80645[segment].
//
// When a segment spans more than one group, each group publishes its partial
// sum in group_res_arr_mem_82503 and bumps a per-segment counter in
// counter_mem_82505 (slot = segment mod 1024); the group that observes the
// counter at groups_per_segment - 1 resets it and folds the partials.
//
// NOTE(review): presumably a per-component term of the GMM objective (going by
// the `rev_gmm_objective` size macro) -- confirm against the Futhark source.
__kernel void segred_large_76256(int32_t N_68508, int32_t D_68509,
                                 int32_t K_68510, int32_t K_68511,
                                 int32_t K_68513, int32_t K_68515,
                                 int32_t D_68526, int32_t num_groups_76328,
                                 __global unsigned char *x_mem_80366,
                                 __global unsigned char *alphas_mem_80367,
                                 __global unsigned char *mem_80631,
                                 __global unsigned char *mem_80635,
                                 __global unsigned char *mem_80639,
                                 __global unsigned char *mem_80642,
                                 __global unsigned char *mem_80645,
                                 __global unsigned char *mem_80649,
                                 int32_t thread_per_segment_82502,
                                 __global unsigned char *group_res_arr_mem_82503,
                                 __global unsigned char *counter_mem_82505)
{
    const int32_t group_size = rev_gmm_objectivezigroup_sizze_76238;
    // The three block_dim constants and local_size are not referenced below.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;

    ALIGNED_LOCAL_MEMORY(red_arr_backing,
                         8 * rev_gmm_objectivezigroup_sizze_76238);
    ALIGNED_LOCAL_MEMORY(sync_arr_backing, 1);

    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t local_size = get_local_size(0);
    const int32_t wave_size = LOCKSTEP_WIDTH;
    const int32_t group_id = get_group_id(0);

    __local char *red_arr = (__local char *) red_arr_backing;
    __local char *sync_arr = (__local char *) sync_arr_backing;

    // Number of work-groups cooperating on one segment (ceiling division of
    // num_groups by max(1, N)).
    const int32_t segment_total = smax32(1, N_68508);
    const int32_t groups_per_segment =
        squot32(num_groups_76328 + segment_total - 1, segment_total);

    // Segment handled by this group and this work-item's slot inside it.
    const int32_t segment_idx = squot32(group_id, groups_per_segment);
    const int32_t items_per_segment = group_size * groups_per_segment;
    const int32_t slot_in_segment = srem32(global_tid, items_per_segment);

    // How many k indices this work-item visits.
    const int32_t chunk_count =
        smin32(squot32(K_68510 + items_per_segment - 1, items_per_segment),
               squot32(K_68510 - slot_in_segment +
                       thread_per_segment_82502 - 1,
                       thread_per_segment_82502));

    double partial_sum = 0.0;

    for (int32_t step = 0; step < chunk_count; step++) {
        const int32_t k = slot_in_segment + thread_per_segment_82502 * step;

        // ---- map stage -------------------------------------------------
        const double alpha_k =
            *(__global double *) &alphas_mem_80367[k * 8];

        // Column sum of mem_80631 over the D rows.
        double q_sum = 0.0;
        for (int32_t d = 0; d < D_68526; d++) {
            const double q_dk =
                *(__global double *) &mem_80631[(d * K_68513 + k) * 8];
            q_sum = q_sum + q_dk;
        }
        const double base_term = alpha_k + q_sum;

        // This work-item's slice of the mem_80642 scratch area; consecutive
        // elements of the slice are group_size doubles apart.
        const int32_t scratch_base =
            group_id * (group_size * D_68526) + local_tid;

        // Stage x[segment, :] - mem_80635[:, k] in scratch.
        for (int32_t d = 0; d < D_68526; d++) {
            const double x_d =
                *(__global double *) &x_mem_80366[(segment_idx * D_68509 +
                                                   d) * 8];
            const double mean_dk =
                *(__global double *) &mem_80635[(d * K_68511 + k) * 8];
            const double diff = x_d - mean_dk;
            *(__global double *) &mem_80642[(scratch_base +
                                             d * group_size) * 8] = diff;
        }

        // Sum over rows of the squared (row . centered) products.
        double sq_norm = 0.0;
        for (int32_t row = 0; row < D_68526; row++) {
            const double q_row =
                *(__global double *) &mem_80631[(row * K_68513 + k) * 8];
            double row_dot = 0.0;

            for (int32_t col = 0; col < D_68526; col++) {
                const double centered =
                    *(__global double *) &mem_80642[(scratch_base +
                                                     col * group_size) * 8];
                double coeff;

                if (slt32(row, col)) {
                    // Above the diagonal.
                    coeff = 0.0;
                } else if (row == col) {
                    // Diagonal entry.
                    coeff = futrts_exp64(q_row);
                } else {
                    // Below the diagonal: flat index into mem_80639.
                    const int32_t tri_all =
                        sdiv32(D_68526 * (D_68526 - 1), 2);
                    const int32_t cols_left = D_68526 - col;
                    const int32_t tri_left =
                        sdiv32(cols_left * (cols_left - 1), 2);
                    const int32_t tri_idx =
                        (tri_all - tri_left) + ((row - col) - 1);
                    coeff = *(__global double *) &mem_80639[(tri_idx *
                                                             K_68515 +
                                                             k) * 8];
                }

                // Product and sum stay in separate statements, as in the
                // generated code.
                const double term = centered * coeff;
                row_dot = row_dot + term;
            }

            const double row_sq = row_dot * row_dot;
            sq_norm = sq_norm + row_sq;
        }

        const double half_sq = 0.5 * sq_norm;
        const double exponent = base_term - half_sq;
        const double exp_val = futrts_exp64(exponent);

        // Map-out result: the exponent itself.
        *(__global double *) &mem_80649[(segment_idx * K_68510 + k) * 8] =
            exponent;

        // ---- reduction operator ---------------------------------------
        partial_sum = partial_sum + exp_val;
    }

    // Publish this work-item's partial sum to local memory.
    *(__local double *) &red_arr[local_tid * 8] = partial_sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Position of this work-item inside its wave, and the wave's index.
    const int32_t wave_idx = squot32(local_tid, wave_size);
    const int32_t lane_idx = local_tid - wave_idx * wave_size;
    const int32_t wave_count =
        squot32(group_size + wave_size - 1, wave_size);

    // ---- stage 1: reduce within the work-group --------------------------
    double acc;
    if (slt32(local_tid, group_size)) {
        acc = *(__local double *) &red_arr[local_tid * 8];
    }

    // Intra-wave steps: volatile local accesses, no barrier.
    // NOTE(review): presumably relies on the LOCKSTEP_WIDTH work-items of a
    // wave executing in lockstep -- confirm for the target device.
    for (int32_t stride = 1; slt32(stride, wave_size); stride *= 2) {
        if (slt32(local_tid + stride, group_size) &&
            (lane_idx & (2 * stride - 1)) == 0) {
            const double incoming =
                *(volatile __local double *) &red_arr[(local_tid +
                                                       stride) * 8];
            acc = acc + incoming;
            *(volatile __local double *) &red_arr[local_tid * 8] = acc;
        }
    }

    // Cross-wave steps: lane 0 of selected waves, barrier before each step.
    for (int32_t skip = 1; slt32(skip, wave_count); skip *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const int32_t stride = skip * wave_size;
        if (slt32(local_tid + stride, group_size) && lane_idx == 0 &&
            (wave_idx & (2 * skip - 1)) == 0) {
            const double incoming =
                *(__local double *) &red_arr[(local_tid + stride) * 8];
            acc = acc + incoming;
            *(__local double *) &red_arr[local_tid * 8] = acc;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (groups_per_segment == 1) {
        // One group per segment: its result is the segment result.
        if (local_tid == 0) {
            *(__global double *) &mem_80645[segment_idx * 8] = acc;
        }
    } else {
        // Several groups per segment: hand off through global memory.
        volatile __global int *segment_counter =
            (volatile __global int *) &counter_mem_82505[srem32(segment_idx,
                                                                1024) * 4];
        int32_t old_counter;

        if (local_tid == 0) {
            *(__global double *) &group_res_arr_mem_82503[group_id * 8] =
                acc;
            mem_fence_global();
            old_counter = atomic_add(segment_counter, 1);
            // True for the group that bumped the counter last.
            *(__local bool *) &sync_arr[0] =
                old_counter == groups_per_segment - 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        bool is_last_group = *(__local bool *) &sync_arr[0];
        if (is_last_group) {
            if (local_tid == 0) {
                // Subtract what the segment's groups added to the counter.
                old_counter = atomic_add(segment_counter,
                                         0 - groups_per_segment);
            }

            // Load the per-group partials of this segment (0.0 padding for
            // work-items beyond groups_per_segment).
            if (slt32(local_tid, groups_per_segment)) {
                partial_sum =
                    *(__global double *) &group_res_arr_mem_82503[
                        (segment_idx * groups_per_segment + local_tid) * 8];
            } else {
                partial_sum = 0.0;
            }
            *(__local double *) &red_arr[local_tid * 8] = partial_sum;
            barrier(CLK_LOCAL_MEM_FENCE);

            // ---- stage 2: reduce the per-group partials ----------------
            double final_acc;
            if (slt32(local_tid, group_size)) {
                final_acc = *(__local double *) &red_arr[local_tid * 8];
            }

            for (int32_t stride = 1; slt32(stride, wave_size);
                 stride *= 2) {
                if (slt32(local_tid + stride, group_size) &&
                    (lane_idx & (2 * stride - 1)) == 0) {
                    const double incoming =
                        *(volatile __local double *) &red_arr[(local_tid +
                                                               stride) * 8];
                    final_acc = final_acc + incoming;
                    *(volatile __local double *) &red_arr[local_tid * 8] =
                        final_acc;
                }
            }

            for (int32_t skip = 1; slt32(skip, wave_count); skip *= 2) {
                barrier(CLK_LOCAL_MEM_FENCE);
                const int32_t stride = skip * wave_size;
                if (slt32(local_tid + stride, group_size) &&
                    lane_idx == 0 &&
                    (wave_idx & (2 * skip - 1)) == 0) {
                    const double incoming =
                        *(__local double *) &red_arr[(local_tid +
                                                      stride) * 8];
                    final_acc = final_acc + incoming;
                    *(__local double *) &red_arr[local_tid * 8] = final_acc;
                }
            }

            // Work-item 0 writes the segment's final sum.
            if (local_tid == 0) {
                *(__global double *) &mem_80645[segment_idx * 8] =
                    final_acc;
            }
        }
    }
}
__kernel void segred_large_78239(int32_t N_68508, int32_t K_68510,
int32_t D_68526, int32_t num_groups_78497,
__global unsigned char *mem_81484, __global
unsigned char *mem_81488,
int32_t thread_per_segment_83108, __global
unsigned char *group_res_arr_mem_83109,
__global unsigned char *counter_mem_83111)
{
const int32_t group_sizze_78487 = rev_gmm_objectivezigroup_sizze_78221;
const int block_dim0 = 0;
const int block_dim1 = 1;
const int block_dim2 = 2;
ALIGNED_LOCAL_MEMORY(red_arr_mem_83113_backing_0, 8 *
rev_gmm_objectivezigroup_sizze_78221);
ALIGNED_LOCAL_MEMORY(sync_arr_mem_83115_backing_1, 1);
int32_t global_tid_78239;
int32_t local_tid_78240;
int32_t group_sizze_83105;
int32_t wave_sizze_83104;
int32_t group_id_78241;
global_tid_78239 = get_global_id(0);
local_tid_78240 = get_local_id(0);
group_sizze_83105 = get_local_size(0);
wave_sizze_83104 = LOCKSTEP_WIDTH;
group_id_78241 = get_group_id(0);
int32_t gtid_78215;
int32_t gtid_78216;
int32_t gtid_78238;
__local char *red_arr_mem_83113;
red_arr_mem_83113 = (__local char *) red_arr_mem_83113_backing_0;
__local char *sync_arr_mem_83115;
sync_arr_mem_83115 = (__local char *) sync_arr_mem_83115_backing_1;
gtid_78215 = squot32(squot32(group_id_78241, squot32(num_groups_78497 +
smax32(1, K_68510 *
D_68526) - 1,
smax32(1, K_68510 *
D_68526))),
D_68526);
gtid_78216 = squot32(group_id_78241, squot32(num_groups_78497 + smax32(1,
K_68510 *
D_68526) -
1, smax32(1, K_68510 *
D_68526))) -
squot32(squot32(group_id_78241, squot32(num_groups_78497 + smax32(1,
K_68510 *
D_68526) -
1, smax32(1, K_68510 *
D_68526))), D_68526) *
D_68526;
int32_t chunk_sizze_83117 = smin32(squot32(N_68508 + group_sizze_78487 *
squot32(num_groups_78497 +
smax32(1, K_68510 *
D_68526) - 1,
smax32(1, K_68510 *
D_68526)) - 1,
group_sizze_78487 *
squot32(num_groups_78497 +
smax32(1, K_68510 *
D_68526) - 1,
smax32(1, K_68510 *
D_68526))),
squot32(N_68508 -
srem32(global_tid_78239,
group_sizze_78487 *
squot32(num_groups_78497 +
smax32(1,
K_68510 *
D_68526) -
1, smax32(1,
K_68510 *
D_68526))) +
thread_per_segment_83108 - 1,
thread_per_segment_83108));
double x_78503;
double x_78504;
x_78503 = 0.0;
for (int32_t i_83121 = 0; i_83121 < chunk_sizze_83117; i_83121++) {
gtid_78238 = srem32(global_tid_78239, group_sizze_78487 *
squot32(num_groups_78497 + smax32(1, K_68510 *
D_68526) - 1,
smax32(1, K_68510 * D_68526))) +
thread_per_segment_83108 * i_83121;
// apply map function
{
double x_78508 = *(__global double *) &mem_81484[(gtid_78215 *
(N_68508 *
D_68526) +
gtid_78216 *
N_68508 +
gtid_78238) * 8];
// save results to be reduced
{
x_78504 = x_78508;
}
// save map-out results
{ }
// apply reduction operator
{
double res_78505 = x_78503 + x_78504;
x_78503 = res_78505;
}
}
}
// to reduce current chunk, first store our result to memory
{
*(__local double *) &red_arr_mem_83113[local_tid_78240 * 8] = x_78503;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t offset_83122;
int32_t skip_waves_83123;
double x_83118;
double x_83119;
offset_83122 = 0;
// participating threads read initial accumulator
{
if (slt32(local_tid_78240, group_sizze_78487)) {
x_83118 = *(__local double *) &red_arr_mem_83113[(local_tid_78240 +
offset_83122) *
8];
}
}
offset_83122 = 1;
while (slt32(offset_83122, wave_sizze_83104)) {
if (slt32(local_tid_78240 + offset_83122, group_sizze_78487) &&
((local_tid_78240 - squot32(local_tid_78240, wave_sizze_83104) *
wave_sizze_83104) & (2 * offset_83122 - 1)) == 0) {
// read array element
{
x_83119 = *(volatile __local
double *) &red_arr_mem_83113[(local_tid_78240 +
offset_83122) * 8];
}
// apply reduction operation
{
double res_83120 = x_83118 + x_83119;
x_83118 = res_83120;
}
// write result of operation
{
*(volatile __local
double *) &red_arr_mem_83113[local_tid_78240 * 8] = x_83118;
}
}
offset_83122 *= 2;
}
skip_waves_83123 = 1;
while (slt32(skip_waves_83123, squot32(group_sizze_78487 +
wave_sizze_83104 - 1,
wave_sizze_83104))) {
barrier(CLK_LOCAL_MEM_FENCE);
offset_83122 = skip_waves_83123 * wave_sizze_83104;
if (slt32(local_tid_78240 + offset_83122, group_sizze_78487) &&
((local_tid_78240 - squot32(local_tid_78240, wave_sizze_83104) *
wave_sizze_83104) == 0 && (squot32(local_tid_78240,
wave_sizze_83104) & (2 *
skip_waves_83123 -
1)) ==
0)) {
// read array element
{
x_83119 = *(__local
double *) &red_arr_mem_83113[(local_tid_78240 +
offset_83122) * 8];
}
// apply reduction operation
{
double res_83120 = x_83118 + x_83119;
x_83118 = res_83120;
}
// write result of operation
{
*(__local double *) &red_arr_mem_83113[local_tid_78240 * 8] =
x_83118;
}
}
skip_waves_83123 *= 2;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (squot32(num_groups_78497 + smax32(1, K_68510 * D_68526) - 1, smax32(1,
K_68510 *
D_68526)) ==
1) {
// first thread in group saves final result to memory
{
if (local_tid_78240 == 0) {
*(__global double *) &mem_81488[(gtid_78215 * D_68526 +
gtid_78216) * 8] = x_83118;
}
}
} else {
int32_t old_counter_83124;
// first thread in group saves group result to global memory
{
if (local_tid_78240 == 0) {
*(__global double *) &group_res_arr_mem_83109[group_id_78241 *
8] = x_83118;
mem_fence_global();
old_counter_83124 = atomic_add((volatile __global int *) &
counter_mem_83111[srem32(squot32(group_id_78241,
squot32(num_groups_78497 +
smax32(1,
K_68510 *
D_68526) -
1,
smax32(1,
K_68510 *
D_68526))),
1024) *
4], 1);
*(__local bool *) &sync_arr_mem_83115[0] = old_counter_83124 ==
squot32(num_groups_78497 + smax32(1, K_68510 * D_68526) - 1,
smax32(1, K_68510 * D_68526)) - 1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_group_83125 = *(__local bool *) &sync_arr_mem_83115[0];
if (is_last_group_83125) {
if (local_tid_78240 == 0) {
old_counter_83124 = atomic_add((volatile __global int *) &
counter_mem_83111[srem32(squot32(group_id_78241,
squot32(num_groups_78497 +
smax32(1,
K_68510 *
D_68526) -
1,
smax32(1,
K_68510 *
D_68526))),
1024) *
4], 0 -
squot32(num_groups_78497 +
smax32(1, K_68510 *
D_68526) - 1,
smax32(1, K_68510 *
D_68526)));
}
// read in the per-group-results
{
if (slt32(local_tid_78240, squot32(num_groups_78497 + smax32(1,
K_68510 *
D_68526) -
1, smax32(1, K_68510 *
D_68526)))) {
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment