@Mezzano
Last active August 23, 2016 15:15
Setting to use device 0
Build Status = -2 ( Err = -11 )
Log: (1387:0) : error : atomic function "atomic_cmpxchg" not supported
Sources:
#ifndef __OPENCL_VERSION__
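// Host-side parsing stubs: when __OPENCL_VERSION__ is not defined (i.e. the
// file is parsed by a host C/C++ compiler rather than an OpenCL compiler),
// the OpenCL qualifiers and built-ins below are defined away so the source
// still parses.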
#define __kernel
#define __global
#define __constant
#define __local
#define get_global_id(x) 0
#define get_global_size(x) 0
#define get_local_id(x) 0
#define get_local_size(x) 0
#define FLT_MAX 0
#define FLT_MIN 0
#define cl_khr_fp64
#define cl_amd_fp64
#define DOUBLE_SUPPORT_AVAILABLE
#define CLK_LOCAL_MEM_FENCE
#define CLK_GLOBAL_MEM_FENCE
#define Dtype float
#define barrier(x)
#define atomic_cmpxchg(x, y, z) x
#define signbit(x) x
#define int_tp long
#define uint_tp unsigned long
#define int_tpc long
#define uint_tpc unsigned long
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define TYPE_FLOAT 1
#define TYPE_DOUBLE 2
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#endif
#if defined(cl_khr_int64_base_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define ATOMICS_64_AVAILABLE
#endif
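// ATOMICS_64_AVAILABLE gates the 64-bit atom_cmpxchg used by the
// double-precision atomic_add emulation further down in this source.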
// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
// Definitions used to cast the types above as needed
#define int_tpc int
#define uint_tpc unsigned int
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16
#define TYPE TYPE_FLOAT
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
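// The elementwise kernels below all use a grid-stride loop: each work item
// starts at get_global_id(0) and advances by get_global_size(0), so any
// launch size covers all n elements.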
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out,
Dtype negative_slope) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
}
}
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff,
Dtype negative_slope) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] = in_diff[index]
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * negative_slope);
}
}
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = tanh(in[index]);
}
}
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* out_data,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype tanhx = out_data[index];
out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
}
}
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = 1.0 / (1.0 + exp(-in[index]));
}
}
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* out_data,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
const Dtype sigmoid_x = out_data[index];
out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
}
}
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] > threshold ? 1.0 : 0.0;
}
}
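// PReLU slope selection: c = (index / dim) % channels / div_factor.
// With div_factor == 1 every channel gets its own slope_data[c]; with
// div_factor == channels, c collapses to 0 and a single shared slope is used.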
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n, const int_tp channels,
const int_tp dim,
__global const Dtype* in,
__global Dtype* out,
__global const Dtype* slope_data,
const int_tp div_factor) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
int_tp c = (index / dim) % channels / div_factor;
out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
}
}
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n, const int_tp channels,
const int_tp dim,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff,
__global const Dtype* slope_data,
const int_tp div_factor) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
int_tp c = (index / dim) % channels / div_factor;
out_diff[index] = in_diff[index]
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * slope_data[c]);
}
}
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n, const int_tp rows,
const int_tp rowPitch,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0?1.0:0.0);
for (int k = 1; k < rows; k++) {
out_diff[index] += in_diff[index + k * rowPitch]
* in_data[index + k * rowPitch]
* (in_data[index + k * rowPitch] <= 0?1.0:0.0);
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* y) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index] = alpha;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count, const int_tp inner_dim,
__global const Dtype* in,
__global const Dtype* permut,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / (inner_dim);
int_tp in_n = (int_tp) (permut[n]);
out[index] = in[in_n * (inner_dim) + index % (inner_dim)];
}
}
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count, const int_tp inner_dim,
__global const Dtype* in,
__global const Dtype* top_indexes,
__global const Dtype* begins,
__global const Dtype* counts,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / (inner_dim);
out[index] = 0;
int_tp lower = (int_tp) (begins[n]);
int_tp upper = lower + (int_tp) (counts[n]);
for (int_tp i = lower; i < upper; ++i) {
int_tp in_n = (int_tp) (top_indexes[i]);
out[index] += in[in_n * (inner_dim) + index % (inner_dim)];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) {
Dtype out = arg;
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const Dtype* bias,
const int_tp bias_dim,
const int_tp inner_dim,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp bias_index = (index / inner_dim) % bias_dim;
out[index] = in[index] + bias[bias_index];
}
}
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const Dtype* scale,
const int_tp scale_dim,
const int_tp inner_dim,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp scale_index = (index / inner_dim) % scale_dim;
out[index] = in[index] * scale[scale_index];
}
}
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const Dtype* scale,
__global const Dtype* bias,
const int_tp scale_dim,
const int_tp inner_dim,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp scale_index = (index / inner_dim) % scale_dim;
out[index] = in[index] * scale[scale_index] + bias[scale_index];
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
if (in[index] > 0.0f) {
out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index])));
} else {
out[index] = log((Dtype) (1.0 + exp(in[index])));
}
}
}
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff) {
Dtype kBNLL_THRESHOLD = 50.;
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD));
out_diff[index] = in_diff[index] * expval / (expval + 1.);
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
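// The kernel_channel_* kernels below implement a numerically stable softmax
// over the channel axis: subtract the per-(num, spatial) channel maximum,
// exponentiate, then divide by the per-(num, spatial) channel sum.
// kernel_channel_dot computes the per-position dot product over channels,
// as used by the softmax backward pass.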
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num, const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* data,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < num * spatial_dim; index +=
get_global_size(0)) {
int_tp n = index / spatial_dim;
int_tp s = index % spatial_dim;
float maxval = -FLT_MAX;
for (int_tp c = 0; c < channels; ++c) {
maxval = max((Dtype)(data[(n * channels + c) * spatial_dim + s]), (Dtype)maxval);
}
out[index] = maxval;
}
}
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count, const int_tp num,
const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* channel_max,
__global Dtype* data) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / channels / spatial_dim;
int_tp s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s];
}
}
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count, __global const Dtype* data,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
out[index] = exp(data[index]);
}
}
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num, const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* data,
__global Dtype* channel_sum) {
for (int_tp index = get_global_id(0); index < num * spatial_dim; index +=
get_global_size(0)) {
int_tp n = index / spatial_dim;
int_tp s = index % spatial_dim;
Dtype sum = 0;
for (int_tp c = 0; c < channels; ++c) {
sum += data[(n * channels + c) * spatial_dim + s];
}
channel_sum[index] = sum;
}
}
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count, const int_tp num,
const int_tp channels, const int_tp spatial_dim,
__global const Dtype* channel_sum,
__global Dtype* data) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / channels / spatial_dim;
int_tp s = index % spatial_dim;
data[index] /= channel_sum[n * spatial_dim + s];
}
}
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num, const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* data_1,
__global const Dtype* data_2,
__global Dtype* channel_dot) {
for (int_tp index = get_global_id(0); index < num * spatial_dim; index +=
get_global_size(0)) {
int_tp n = index / spatial_dim;
int_tp s = index % spatial_dim;
Dtype dot = 0;
for (int_tp c = 0; c < channels; ++c) {
dot += (data_1[(n * channels + c) * spatial_dim + s]
* data_2[(n * channels + c) * spatial_dim + s]);
}
channel_dot[index] = dot;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
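// concat: each element's index in the bottom blob is split into a batch part
// and a within-bottom offset; the matching index in the concatenated top blob
// is shifted by offset_concat_axis along the concatenation axis.
// forward == 1 copies bottom -> top, otherwise top -> bottom (backward).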
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads, __global const Dtype* in_data,
const int forward, const int_tp num_concats,
const int_tp concat_size,
const int_tp top_concat_axis,
const int_tp bottom_concat_axis,
const int_tp offset_concat_axis,
__global Dtype* out_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp total_concat_size = concat_size * bottom_concat_axis;
const int_tp concat_num = index / total_concat_size;
const int_tp concat_index = index % total_concat_size;
const int_tp top_index = concat_index
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
if (forward == 1) {
out_data[top_index] = in_data[index];
} else {
out_data[index] = in_data[top_index];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels,
const Dtype margin, const Dtype alpha, __global const Dtype* y,
__global const Dtype* diff, __global const Dtype* dist_sq,
__global Dtype *bottom_diff) {
for (int_tp i = get_global_id(0); i < count;
i += get_global_size(0)) {
int_tp n = i / channels; // the num index, to access y and dist_sq
if (trunc(y[n]) != 0.) { // similar pairs
bottom_diff[i] = alpha * diff[i];
} else { // dissimilar pairs
Dtype mdist = 0.;
Dtype beta = 0.;
Dtype dist = sqrt(dist_sq[n]);
mdist = (margin - dist);
beta = -alpha * mdist / (dist + 1e-4) * diff[i];
if (mdist > 0.) {
bottom_diff[i] = beta;
} else {
bottom_diff[i] = 0;
}
}
}
}
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels,
const Dtype margin, const Dtype alpha, __global Dtype* y,
__global Dtype* diff, __global Dtype* dist_sq,
__global Dtype* bottom_diff) {
for (int_tp i = get_global_id(0); i < count;
i += get_global_size(0)) {
int n = i / channels; // the num index, to access y and dist_sq
if (trunc(y[n]) != 0.) { // similar pairs
bottom_diff[i] = alpha * diff[i];
} else { // dissimilar pairs
Dtype mdist = 0.;
Dtype beta = 0.;
mdist = (margin - dist_sq[n]);
beta = -alpha;
if (mdist > 0.) {
bottom_diff[i] = beta;
} else {
bottom_diff[i] = 0;
}
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) {
Dtype out = arg;
}
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
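// LOOP(N, VAR, STMT) token-pastes LOOPN and unrolls STMT N times, bumping VAR
// after each repetition, e.g. LOOP(2, i, body) expands to
//   (body); (i)++; (body); (i)++;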
#ifdef MULTI
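// CFMulti: one work item per output (x, y) position, computing ZPAR
// consecutive output channels at once. The kernel window is walked row by
// row with Dtype4 vector loads (float4Reads full loads plus a 1-3 element
// tail); partial sums are kept per channel in vectorSum, reduced to scalars,
// and the bias is added if APPLY_BIAS == 1.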
__kernel void CFMulti(__global Dtype* image_data, int_tp image_offset,
__global Dtype* kernel_data, int_tp kernel_offset,
__global Dtype* bias,const int_tp bias_offset,
__global Dtype* convolved_image,const int_tp convolved_image_offset,
const ushort WIDTH,
const ushort HEIGHT,
const ushort OUTPUT_W,
const ushort OUTPUT_H) {
const int_tp outputX = get_global_id(0);
const int_tp outputY = get_global_id(1);
const int_tp kernelNum = get_global_id(2)*ZPAR;
if(outputX < OUTPUT_W && outputY < OUTPUT_H)
{
Dtype sum[ZPAR];
Dtype4 vectorSum[ZPAR];
for(int_tp kern =0; kern < ZPAR; kern++)
{
sum[kern] = 0.0f;
vectorSum[kern] = (Dtype4)(0.0f, 0.0f, 0.0f, 0.0f); // (Dtype4) cast makes this a vector literal rather than a comma expression
}
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS;
const int_tp biasIndex=bias_offset + kernelNum;
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W;
const int_tp imageSize = WIDTH*HEIGHT;
const int_tp float4Reads = KERNEL_W / 4;
const int_tp floatReads = KERNEL_W % 4;
Dtype4 imageCache;
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset));
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
for(int_tp c = 0; c < CHANNELS; c++)
{
for(int_tp y = 0; y < KERNEL_H; y++)
{
for(int_tp x=0; x< float4Reads; x++)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[x];
for(int_tp kern =0; kern < ZPAR; kern++)
{
vectorSum[kern] += imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[x];
}
}
if(floatReads == 1)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads];
for(int_tp kern =0; kern < ZPAR; kern++)
vectorSum[kern].s0 += ( imageCache * ( (__global Dtype4*) &(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]) )[float4Reads] ).s0;
}
else if(floatReads == 2)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads];
for(int_tp kern =0; kern < ZPAR; kern++)
vectorSum[kern].s01 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s01;
}
else if(floatReads == 3)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads];
for(int_tp kern =0; kern < ZPAR; kern++)
vectorSum[kern].s012 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s012;
}
image_dataPtrFloat += WIDTH;
kernel_dataPtrFloat += KERNEL_W;
}
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H;
}
for(int_tp kern =0; kern < ZPAR; kern++)
sum[kern] = vectorSum[kern].x + vectorSum[kern].y + vectorSum[kern].z + vectorSum[kern].w;
if(APPLY_BIAS == 1)
{
for(int_tp kern = 0; kern < ZPAR; kern++)
if(kernelNum+kern < OUTPUT_Z)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] =
sum[kern] + bias[biasIndex +kern];
}
else
for(int_tp kern = 0; kern < ZPAR; kern++)
if(kernelNum+kern < OUTPUT_Z)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = sum[kern];
}
}
#endif
#ifdef MULTI_11
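// CFMulti_11_11_4: specialization for 11x11 kernels; adjacent outputs within
// a work item read input slices 4 pixels apart, so a stride of 4 is
// effectively baked into the vector slicing (as the name suggests, e.g. an
// AlexNet-style first layer). Each work item produces XPAR adjacent output
// columns for ZPAR output channels, forming the 11-wide row products with
// dot() on Dtype16/Dtype8/Dtype4 slices of the cached input row.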
__kernel void CFMulti_11_11_4(__global Dtype* image_data, int_tp image_offset,
__global Dtype* kernel_data, int_tp kernel_offset,
__global Dtype* bias,const int_tp bias_offset,
__global Dtype* convolved_image,const int_tp convolved_image_offset,
const ushort WIDTH,
const ushort HEIGHT,
const ushort OUTPUT_W,
const ushort OUTPUT_H) {
int_tp outputX = get_global_id(0)*XPAR;
int_tp outputY = get_global_id(1)*YPAR;
int_tp kernelNum = get_global_id(2)*ZPAR;
if(outputX < OUTPUT_W && outputY < OUTPUT_H)
{
Dtype sum[XPAR*YPAR*ZPAR];
for(int_tp kern =0; kern < XPAR*YPAR*ZPAR; kern++)
{
sum[kern] = 0.0f;
}
int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS;
int_tp biasIndex=bias_offset + kernelNum;
int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W;
int_tp imageSize = WIDTH*HEIGHT;
int_tp index;
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset));
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
Dtype16 imageCache;
Dtype8 imageCacheR;
Dtype8 kernelCache;
Dtype4 kernelCacheR;
for(int_tp c = 0; c < CHANNELS; c++)
{
for(int_tp y = 0; y < 11; y++)
{
imageCache = ((__global Dtype16*)image_dataPtrFloat)[0];
imageCacheR =((__global Dtype8*)image_dataPtrFloat)[2];
for(int_tp kern =0; kern < ZPAR; kern++)
{
kernelCache = ((__global Dtype8*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0];
kernelCacheR = ((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[2];
index = kern*XPAR;
sum[index + 0] += dot(imageCache.S0123,kernelCache.S0123);
sum[index + 1] += dot(imageCache.S4567,kernelCache.S0123);
sum[index + 2] += dot(imageCache.S89AB,kernelCache.S0123);
sum[index + 3] += dot(imageCache.SCDEF,kernelCache.S0123);
sum[index + 0] += dot(imageCache.S4567,kernelCache.S4567);
sum[index + 1] += dot(imageCache.S89AB,kernelCache.S4567);
sum[index + 2] += dot(imageCache.SCDEF,kernelCache.S4567);
sum[index + 3] += dot(imageCacheR.S0123,kernelCache.S4567);
sum[index + 0] += dot(imageCache.S89A,kernelCacheR.S012);
sum[index + 1] += dot(imageCache.SCDE,kernelCacheR.S012);
sum[index + 2] += dot(imageCacheR.S012,kernelCacheR.S012);
sum[index + 3] += dot(imageCacheR.S456,kernelCacheR.S012);
}
image_dataPtrFloat += WIDTH;
kernel_dataPtrFloat += KERNEL_W;
}
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H;
}
if(APPLY_BIAS == 1)
{
for(int_tp kern = 0; kern < ZPAR; kern++)
{
for(int_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] =
sum[kern*XPAR + wi] + bias[biasIndex +kern];
}
}
else
for(int_tp kern = 0; kern < ZPAR; kern++)
for(int_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = sum[kern*XPAR + wi];
}
}
#endif
#ifdef MULTI_GEN
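// CFMulti_6: generic tiled variant producing XPAR x YPAR outputs per work
// item for ZPAR output channels. YPAR input rows are preloaded into
// imageCache and rotated as the kernel window slides down. DTImage, DTKernel
// and floatDotV4 are not defined in this listing; presumably they are
// injected as build options by the host, with floatDotV4 returning the four
// horizontal dot products of an image row against a kernel row.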
__kernel void CFMulti_6(__global const Dtype* restrict image_data, const int_tp image_offset,
__global const Dtype* restrict kernel_data, const int_tp kernel_offset,
__global const Dtype* restrict bias,const int_tp bias_offset,
__global Dtype* restrict convolved_image,const int_tp convolved_image_offset,
const ushort WIDTH,
const ushort HEIGHT,
const ushort OUTPUT_W,
const ushort OUTPUT_H) {
const int_tp outputX = get_global_id(0)*XPAR;
const int_tp outputY = get_global_id(1)*YPAR;
const int_tp kernelNum = get_global_id(2)*ZPAR;
if(outputX < OUTPUT_W && outputY < OUTPUT_H)
{
Dtype sum[XPAR*YPAR*ZPAR];
for(uint_tp kern = 0; kern < XPAR*YPAR*ZPAR; kern++)
sum[kern] = 0.0f;
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS;
const int_tp biasIndex=bias_offset + kernelNum;
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W;
const int_tp imageSize = WIDTH*HEIGHT;
int_tp index;
__global const Dtype* image_dataPtrFloat[2];
image_dataPtrFloat[0] = (image_data + (image_offset + local_image_offset));
image_dataPtrFloat[1] = image_dataPtrFloat[0];
__global const Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
DTImage imageCache[YPAR];
DTKernel kernelCache;
Dtype4 temp;
for(uint_tp c = 0; c < CHANNELS; c++)
{
imageCache[0] = ((__global DTImage*)image_dataPtrFloat[1])[0];
for(uint_tp preload = 1; preload < YPAR; preload++)
{
image_dataPtrFloat[1] += WIDTH;
imageCache[preload] = ((__global DTImage*)image_dataPtrFloat[1])[0];
}
int_tp y =0;
LOOP(KERNEL_H, y,
{
int_tp kern=0;
LOOP(ZPAR, kern,
{
kernelCache = ((__global DTKernel*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0];
index = kern*XPAR*YPAR;
for(uint_tp y_par = 0; y_par < YPAR; y_par++)
{
temp = floatDotV4(imageCache[y_par],kernelCache);
sum[index + y_par*XPAR + 0] += temp.s0;
sum[index + y_par*XPAR + 1] += temp.s1;
sum[index + y_par*XPAR + 2] += temp.s2;
sum[index + y_par*XPAR + 3] += temp.s3;
}
});
kernel_dataPtrFloat += KERNEL_W;
for(uint_tp rotateData = 0; rotateData < YPAR - 1; rotateData++)
imageCache[rotateData] = imageCache[rotateData + 1];
image_dataPtrFloat[1] += WIDTH;
imageCache[YPAR - 1] = ((__global DTImage*)image_dataPtrFloat[1])[0];
});
image_dataPtrFloat[0] += imageSize;
image_dataPtrFloat[1] = image_dataPtrFloat[0];
}
if(APPLY_BIAS == 1)
{
for(uint_tp kern = 0; kern < ZPAR; kern++)
{
for(uint_tp hi =0; hi < YPAR; hi++)
for(uint_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY +hi)*OUTPUT_W + outputX + wi] =
sum[kern*XPAR*YPAR + XPAR*hi + wi] + bias[biasIndex +kern];
}
}
else
for(uint_tp kern = 0; kern < ZPAR; kern++)
for(uint_tp hi =0; hi < YPAR; hi++)
for(uint_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY + hi)*OUTPUT_W + outputX + wi] = sum[kern*XPAR*YPAR +XPAR*hi +wi];
}
}
#endif
//Begin IDLF kernels below here
#ifdef IDLF
#define activation_function(x) (x)
#if 0
#define _IW INPUT_WIDTH
#define _IH INPUT_HEIGHT
#define _OW OUTPUT_WIDTH
#define _OH OUTPUT_HEIGHT
#endif
#define _ID INPUT_DEPTH
#define _OD NUM_FILTERS
#define FILTER_DEPTH INPUT_DEPTH
#define NUM_INPUT INPUT_DEPTH
#define NUM_OUTPUT NUM_FILTERS
#define KERNEL FILTER_WIDTH
// convolution stride, same for x and y
#define K_STRIDE STRIDEX
#ifndef IWPAD
#define IWPAD 0
#endif
#ifndef IHPAD
#define IHPAD 0
#endif
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT)
#ifndef MASTER_OUT_BLOCK_WIDTH
#define MASTER_OUT_BLOCK_WIDTH OUT_BLOCK_WIDTH
#endif
#ifndef MASTER_OUT_BLOCK_HEIGHT
#define MASTER_OUT_BLOCK_HEIGHT OUT_BLOCK_HEIGHT
#endif
// Each work-item computes a 4x6 region of one output map.
// Each work-group (which will be mapped to 1 SIMD16 EU thread) will compute 16 different feature maps, but each feature map is for the same 4x6 region of the input image.
// NDRange: (_OW+pad)/ OUT_BLOCK_WIDTH, (_OH+pad)/OUT_BLOCK_HEIGHT, _OD/OUT_BLOCK_DEPTH
//#define SIMD_SIZE 16
// NOTE: this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
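// Example (values not fixed by this listing): with OUT_BLOCK_WIDTH = 4,
// OUT_BLOCK_HEIGHT = 6 and SIMD_SIZE = 16, one 16-wide work-group computes a
// 4x6 output tile for 16 consecutive feature maps; get_global_id(0)/(1)
// select the tile and get_global_id(2) the feature map (lane = get_local_id(2)).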
#ifdef SIMD16
#define TILE_X ((OUT_BLOCK_WIDTH - 1) * STRIDEX + KERNEL)
#define TILE_Y ((OUT_BLOCK_HEIGHT - 1) * STRIDEY + KERNEL)
#if (TILE_X % 4) != 0
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
kernel void
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs
__global float* inputs_base,
filter_qualifier float* weights_base,
__global float* biases_base,
__global float* outputs_base,
const ushort _IW,
const ushort _IH,
const ushort _OW,
const ushort _OH)
{
__global float* outputs = outputs_base;
__global float* inputs = inputs_base;
filter_qualifier float* weights = weights_base;
__global float* biases = biases_base;
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth
uint_tp fmg = get_group_id(2);
uint_tp lid = get_local_id(2);
float in[IN_BUFFER_SIZE];// load 11x16 block of input data, really only need 11x15 for 4x6 outputs, but keep it simple.
//float out[24]; // 4x6 block of outputs that is SIMD_SIZE deep (along the Feature Map dimension).
float out[OUT_BLOCK_SIZE];
uint_tp in_addr;
// find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid;
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) {
out[i]=0.0f;
}
uint_tp num_in_batch = fm / _OD;
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE;
for(int_tp kd = 0; kd < _ID; kd++)
{
in_addr = input_batch_offset + (kd + INPUT_START_Z) * (_IH + IHPAD) * (_IW + IWPAD) + (or*K_STRIDE + INPUT_START_Y) * (_IW + IWPAD) + (oc*K_STRIDE + INPUT_START_X) + lid;
// read 11x16 input block into registers.
for(uint_tp reg = 0; reg < IN_BUFFER_SIZE; reg++) {
in[reg] = inputs[in_addr]; // read 16 elements
in_addr += (_IW + IWPAD);// move to next row down
}
// WEIGHT_PREF can be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8
union {
float w[WEIGHT_PREF];
uint8 ui8;
} weight_buf;
int_tp w_idx=0;
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
uint_tp orig_weight_addr = weight_addr;
weight_addr += SIMD_SIZE * WEIGHT_PREF;
int_tp kr = 0; // kr = Kernel Row
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop.
{
int_tp kc = 0; // kc = Kernel Column
LOOP(KERNEL, kc,
{
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) {
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
float input = intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
}
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL*KERNEL % 8 != 0
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
}
#if KERNEL*KERNEL % 8 == 0
// need to do nothing
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF)))
#if KERNEL*KERNEL % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL*KERNEL % 4 == 0
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]);
#else
// should never get here if KERNEL_W equals KERNEL_H; kept just in case.
#error unsupported kernel size.
#endif
#endif
++w_idx;
});
});
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE;
}
#ifdef IMAGE_AS_OUTPUT
// TODO: no ULT for that one yet!
uint_tp out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD)) * (_OW + OWPAD) * (_OH + OHPAD);// out_addr indexes into start of 16 feature maps.
#else
// we need this address calculation for outputs because we support views and batching
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD);
#endif
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work item is working on
// we need this address calculation for biases because we support views and batching
float bias = biases[(fm) % _OD ];
#ifndef WRITE_PADDED_VALUES
if(get_global_id(0) != (get_global_size(0)-1) &&
get_global_id(1) != (get_global_size(1)-1) )
{
#endif
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
#ifndef WRITE_PADDED_VALUES
} else if ( get_global_id(1) != (get_global_size(1)-1) )
{
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
}
else if ( get_global_id(0) != (get_global_size(0)-1) )
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
}
else
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(c,r*(_OW + OWPAD)),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
}
#endif //#ifndef WRITE_PADDED_VALUES
}
#endif
#if TILE_X % 4 == 0
#define TILE_Y_STRIDE (64 / TILE_X)
#define INVEC_NUM ((TILE_Y + TILE_Y_STRIDE - 1) / TILE_Y_STRIDE)
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
kernel void
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs
__global float* inputs_base,
filter_qualifier float* weights_base,
__global float* biases_base,
__global float* outputs_base,
const ushort _IW,
const ushort _IH,
const ushort _OW,
const ushort _OH)
{
__global float* outputs = outputs_base;
__global float* inputs = inputs_base;
filter_qualifier float* weights = weights_base;
__global float* biases = biases_base;
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth
uint_tp fmg = get_group_id(2);
uint_tp lid = get_local_id(2);
float out[OUT_BLOCK_SIZE];
uint_tp in_addr;
// find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid;
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) {
out[i]=0.0f;
}
uint_tp num_in_batch = ( fm ) / _OD;
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE;
in_addr = input_batch_offset + INPUT_START_Z * (_IH + IHPAD) * (_IW + IWPAD) + (or*STRIDEY + INPUT_START_Y) * (_IW + IWPAD) + (oc*STRIDEX + INPUT_START_X)
+ ( lid / ( TILE_X / 4 ) ) * (_IW + IWPAD) * STRIDEY // y tile offset
+ ( lid % ( TILE_X / 4 ) ) * 4 * STRIDEX; // x tile offset
for(int_tp kd = 0; kd < _ID; kd++)
{
union {
float4 in_vec[INVEC_NUM];
float in_array[INVEC_NUM * 4];
} in_buf;
uint_tp in_offset = in_addr;
int_tp reg = 0;
#if INVEC_NUM == 1
LOOP(1, reg,
#elif INVEC_NUM == 2
LOOP(2, reg,
#elif INVEC_NUM == 3
LOOP(3, reg,
#elif INVEC_NUM == 4
LOOP(4, reg,
#else
#error too large invec_num.
#endif
{
in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // read 16 elements
in_offset += (_IW + IWPAD) * TILE_Y_STRIDE;
});
in_addr += (_IH + IHPAD) * (_IW + IWPAD);
// WEIGHT_PREF can be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8
union {
float w[WEIGHT_PREF];
uint8 ui8;
} weight_buf;
int_tp w_idx=0;
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
uint_tp orig_weight_addr = weight_addr;
weight_addr += SIMD_SIZE * WEIGHT_PREF;
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
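// BLOCK_IN(n): the input tile is spread across the 16 lanes of the sub-group
// in float4 chunks; tile element n lives in register n / (TILE_Y_STRIDE * TILE_X)
// of lane (n % (TILE_Y_STRIDE * TILE_X)) / 4, component n % 4, and this macro
// broadcasts it from that lane.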
int_tp kr = 0; // kr = Kernel Row
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop.
{
int_tp kc = 0; // kc = Kernel Column
LOOP(KERNEL, kc,
{
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) {
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
float input = BLOCK_IN((br * STRIDEY + kr) * TILE_X + bc * STRIDEX + kc);//intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
}
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL*KERNEL % 8 != 0
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
}
#if KERNEL*KERNEL % 8 == 0
// need to do nothing
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF)))
#if KERNEL*KERNEL % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL*KERNEL % 4 == 0
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]);
#else
// should never get here if KERNEL_W equals KERNEL_H; kept just in case.
#error unsupported kernel size.
#endif
#endif
++w_idx;
});
});
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE;
}
// we need this address calculation for outputs because we support views and batching
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD);
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work item is working on
// we need this address calculation for biases because we support views and batching
float bias = biases[(fm) % _OD ];
#ifndef WRITE_PADDED_VALUES
if(get_global_id(0) != (get_global_size(0)-1) &&
get_global_id(1) != (get_global_size(1)-1) )
{
#endif
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
#ifndef WRITE_PADDED_VALUES
} else if ( get_global_id(1) != (get_global_size(1)-1) )
{
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
}
else if ( get_global_id(0) != (get_global_size(0)-1) )
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
}
else
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
}
#endif //#ifndef WRITE_PADDED_VALUES
}
#endif // TILE_X % 4 == 0
#endif
#endif
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(copyImage, Dtype)
(__global Dtype* image_data,
int_tp image_offset,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp adjustedHeight, const int_tp adjustedWidth,
const int_tp pad_h, const int_tp pad_w,
__global Dtype* output_image,
const int_tp output_offset,
const int_tp batch_size) {
uint_tp sX = get_global_id(0);
uint_tp sY = get_global_id(1);
uint_tp sZ = get_global_id(2);
int_tp in_y = sY - pad_h;
int_tp in_x = sX - pad_w;
int_tp batch_offset = 0;
int_tp adjusted_batch_offset = 0;
for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) {
int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX;
int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x;
if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width))
output_image[dst_offset] = image_data[src_offset];
else
output_image[dst_offset] = 0;
batch_offset += height * width * channels;
adjusted_batch_offset += adjustedHeight * adjustedWidth * channels;
}
}
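// copyWeightsSwizzled reorders weights from [filter][channel][ky][kx] to
// [filter / swizzleFactor][channel][ky][kx][filter % swizzleFactor], i.e.
// groups of swizzleFactor filters interleaved in the innermost dimension,
// which is the SIMD-swizzled layout the convolution kernels above expect
// for intel_sub_group_block_read.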
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
(__global Dtype* weightIn,
__global Dtype* weightOut,
const int_tp kernel_w,
const int_tp kernel_h,
const int_tp channels,
const int_tp outputs,
const int_tp swizzleFactor) {
uint_tp sX = get_global_id(0);
//Original location
//Output location
int_tp outputSublayer = channels / swizzleFactor;
int_tp outputSublayerIndex = channels % swizzleFactor;
int_tp filter = sX / (kernel_w*kernel_h*channels);
int_tp kernel_X = sX % kernel_w;
int_tp kernel_Y = (sX / kernel_w) % kernel_h;
int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels;
int_tp FP = filter / swizzleFactor;
int_tp F1 = filter % swizzleFactor;
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(crop_copy, Dtype)(const int_tp n, const int_tp height,
const int_tp width,
const int_tp src_outer_stride,
const int_tp src_inner_stride,
const int_tp dest_outer_stride,
const int_tp dest_inner_stride,
__global const Dtype* src,
const int_tp src_off,
__global Dtype* dest,
const int_tp dest_off) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
int_tp src_start = index / height * src_outer_stride
+ index % height * src_inner_stride;
int_tp dest_start = index / height * dest_outer_stride
+ index % height * dest_inner_stride;
for (int_tp i = 0; i < width; ++i) {
dest[dest_off + dest_start + i] = src[src_off + src_start + i];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const uint_tp* mask,
const uint_tp threshold,
const Dtype scale,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale;
}
}
__kernel void TEMPLATE(dropout_backward,Dtype)(
const int_tp n, __global const Dtype* in_diff,
__global const uint_tp* mask, const uint_tp threshold,
const Dtype scale,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(eltwise_max_forward,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data_a,
__global const Dtype* bottom_data_b, const int_tp blob_idx,
__global Dtype* top_data,
__global int_tp* mask) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
if (bottom_data_a[index] > bottom_data_b[index]) {
// only update for very first bottom_data blob (blob_idx == 0)
if (blob_idx == 0) {
maxval = bottom_data_a[index];
top_data[index] = maxval;
maxidx = blob_idx;
mask[index] = maxidx;
}
} else {
maxval = bottom_data_b[index];
top_data[index] = maxval;
maxidx = blob_idx + 1;
mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int_tp blob_idx,
__global const int_tp* mask,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
Dtype gradient = 0;
if (mask[index] == blob_idx) {
gradient += top_diff[index];
}
bottom_diff[index] = gradient;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(elu_forward,Dtype)(const int n, __global const Dtype* in,
__global Dtype* out,
Dtype alpha) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0);
}
}
__kernel void TEMPLATE(elu_backward,Dtype)(const int n, __global const Dtype* in_diff,
__global const Dtype* out_data,
__global const Dtype* in_data,
__global Dtype* out_diff,
Dtype alpha) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] =
in_data[index] > 0 ?
in_diff[index] : in_diff[index] * (out_data[index] + alpha);
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads,
__global const Dtype* bottom_data,
__global const Dtype* weight,
const int_tp M, const int_tp N,
const int_tp K,
__global Dtype* top_data) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
top_data[top_index] = weight[weight_index];
}
}
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
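// The float atomic_add below emulates a floating-point atomic add with a
// compare-and-swap loop: the current and updated values are reinterpreted as
// 32-bit unsigned ints through the union, and atomic_cmpxchg retries until no
// other work item has modified *source in between. An atomic_cmpxchg call
// like this one is presumably what the build log above reports as
// unsupported on this device/driver.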
#if (TYPE == TYPE_FLOAT)
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) {
union {
uint_tp intVal;
Dtype floatVal;
} newVal;
union {
uint_tp intVal;
Dtype floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K,
__global Dtype* weight_diff) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index));
}
}
#endif
#if (TYPE == TYPE_DOUBLE)
#ifdef ATOMICS_64_AVAILABLE
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) {
union {
unsigned long intVal;
Dtype floatVal;
} newVal;
union {
unsigned long intVal;
Dtype floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K,
__global Dtype* weight_diff) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index));
}
}
#endif
#endif
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(fft_phony,Dtype)(Dtype arg) {
Dtype out = arg;
}
#ifdef FFT
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
#define DtypeComplex Dtype2
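// copy2buffer_cyclic_shift_in copies each filter (ker_size elements per
// channel, ker_w wide) into the zero-padded fft_height x complex_w_len
// buffer, cyclically shifting it so the kernel center (ker_c_h, ker_c_w)
// wraps around to index (0, 0), presumably the layout expected before taking
// the forward FFT of the weights.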
__kernel void TEMPLATE(copy2buffer_cyclic_shift_in,Dtype)(
__global Dtype* fft_gpu_weights_real, const int_tp offset_fft_gpu_weights_real,
__global Dtype* weight, const int_tp offset_weight,
const int_tp ker_size, const int_tp ch_gr, const int_tp ker_size_ch_gr,
const int_tp ker_w, const int_tp ker_c_h, const int_tp ker_c_w,
const int_tp fft_height, const int_tp fft_width, const int_tp complex_w_len) {
fft_gpu_weights_real += offset_fft_gpu_weights_real;
weight += offset_weight;
int_tp gId = get_global_id(0);
int_tp out = gId / ker_size_ch_gr;
int_tp c = (gId - out * ker_size_ch_gr) / ker_size;
int_tp map_offset = out * ch_gr + c;
int_tp map_offset_ker_size = map_offset * ker_size;
int_tp pos_in_map = gId - map_offset_ker_size;
int_tp h = pos_in_map / ker_w;
int_tp h_ker_w = h * ker_w;
int_tp w = pos_in_map - h_ker_w;
int_tp src_idx = map_offset_ker_size + h_ker_w + w;
int_tp ky = h - ker_c_h;
if (ky < 0) ky += fft_height;
int_tp kx = w - ker_c_w;
if (kx < 0) kx += fft_width;
int_tp dst_idx = (map_offset * fft_height + ky) * complex_w_len + kx;
fft_gpu_weights_real[dst_idx] = weight[src_idx];
}
/* Use when width < 4 */
__kernel void TEMPLATE(copy2buffer_left_top_in_naive,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp size,
const int_tp height_out, const int_tp width_out,
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp h = gId / width;
int_tp w = gId - (h * width);
int_tp dst_idx = (h*stride_h + pad_h)*width_out + (w*stride_w + pad_w);
map_out[dst_idx] = map_in[gId];
}
/* Use when width < 4 */
__kernel void TEMPLATE(copy2buffer_left_top_in_naive_2d,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp map_out_size, const int_tp size, const int_tp count,
const int_tp height_out, const int_tp width_out,
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId_x = get_global_id(0);
int_tp gId_y = get_global_id(1);
int_tp h = gId_x / width;
int_tp w = gId_x - (h * width);
int_tp src_idx = gId_y * size + gId_x;
int_tp dst_idx = gId_y * map_out_size +
(h * stride_h + pad_h) * width_out + (w * stride_w + pad_w);
map_out[dst_idx] = map_in[src_idx];
}
/* Use when width >= 4 */
__kernel void TEMPLATE(copy2buffer_left_top_in,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp size,
const int_tp height_out, const int_tp width_out,
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp count = size >> 2;
int_tp gId4 = gId << 2;
int_tp h = gId4 / width;
int_tp w = gId4 - (h * width);
int_tp dst_h = h*stride_h + pad_h;
int_tp dst_w = w*stride_w + pad_w;
int_tp dst_idx = dst_h*width_out + dst_w;
if (gId < count) {
Dtype4 map_in_cache4 = vload4(gId, map_in);
int_tp has_pad = width - dst_w;
if (has_pad >= 4) {
vstore4(map_in_cache4, dst_idx >> 2, map_out);
} else {
if (0 == has_pad) {
dst_idx += width_out + pad_w - dst_w;
}
map_out[dst_idx] = map_in_cache4.x;
if (1 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 1;
}
map_out[dst_idx+1] = map_in_cache4.y;
if (2 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 2;
}
map_out[dst_idx+2] = map_in_cache4.z;
if (3 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 3;
}
map_out[dst_idx+3] = map_in_cache4.w;
dst_h += 1;
dst_w = pad_w;
}
} else if (gId == count) {
int_tp res = size - (count << 2); /* size % 4 */
if (res > 0) {
Dtype4 map_in_cache4 = 0.f;
if (res >= 1)
map_in_cache4.x = map_in[gId4];
if (res >= 2)
map_in_cache4.y = map_in[gId4+1];
if (res == 3)
map_in_cache4.z = map_in[gId4+2];
int_tp has_pad = width - dst_w;
if (has_pad >= 4) {
vstore4(map_in_cache4, dst_idx >> 2, map_out);
} else {
if (0 == has_pad) {
dst_idx += width_out + pad_w - dst_w;
}
map_out[dst_idx] = map_in_cache4.x;
if (1 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 1;
}
map_out[dst_idx+1] = map_in_cache4.y;
if (2 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 2;
}
map_out[dst_idx+2] = map_in_cache4.z;
if (3 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 3;
}
map_out[dst_idx+3] = map_in_cache4.w;
dst_h += 1;
dst_w = pad_w;
}
}
}
}
/* Use when width >= 4 */
__kernel void TEMPLATE(copy2buffer_left_top_in_2d,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp map_out_size, const int_tp size, const int_tp count,
const int_tp height_out, const int_tp width_out,
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp gId_y = get_global_id(1);
int_tp gId4 = gId << 2;
int_tp h = gId4 / width;
int_tp w = gId4 - (h * width);
int_tp dst_h = h*stride_h + pad_h;
int_tp dst_w = w*stride_w + pad_w;
int_tp dst_idx = dst_h*width_out + dst_w;
const __global Dtype* map_in_2d = map_in + gId_y * size;
__global Dtype* map_out_2d = map_out + gId_y * map_out_size;
if (gId < count) {
Dtype4 map_in_cache4 = vload4(gId, map_in_2d);
int_tp has_pad = width - dst_w;
if (has_pad >= 4) {
vstore4(map_in_cache4, dst_idx >> 2, map_out_2d);
} else {
if (0 == has_pad) {
dst_idx += width_out + pad_w - dst_w;
}
map_out_2d[dst_idx] = map_in_cache4.x;
if (1 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 1;
}
map_out_2d[dst_idx+1] = map_in_cache4.y;
if (2 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 2;
}
map_out_2d[dst_idx+2] = map_in_cache4.z;
if (3 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 3;
}
map_out_2d[dst_idx+3] = map_in_cache4.w;
dst_h += 1;
dst_w = pad_w;
}
} else if (gId == count) {
int_tp res = size - (count << 2); /* size % 4 */
if (res > 0) {
Dtype4 map_in_cache4 = 0.f;
if (res >= 1)
map_in_cache4.x = map_in_2d[gId4];
if (res >= 2)
map_in_cache4.y = map_in_2d[gId4+1];
if (res == 3)
map_in_cache4.z = map_in_2d[gId4+2];
int_tp has_pad = width - dst_w;
if (has_pad >= 4) {
vstore4(map_in_cache4, dst_idx >> 2, map_out_2d);
} else {
if (0 == has_pad) {
dst_idx += width_out + pad_w - dst_w;
}
map_out_2d[dst_idx] = map_in_cache4.x;
if (1 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 1;
}
map_out_2d[dst_idx+1] = map_in_cache4.y;
if (2 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 2;
}
map_out_2d[dst_idx+2] = map_in_cache4.z;
if (3 == has_pad) {
dst_idx += width_out + pad_w - dst_w - 3;
}
map_out_2d[dst_idx+3] = map_in_cache4.w;
dst_h += 1;
dst_w = pad_w;
}
}
}
}
/* Use when width_out < 4 */
__kernel void TEMPLATE(copy2buffer_left_top_out_naive,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp size,
const int_tp height_out, const int_tp width_out,
const int_tp fft_height, const int_tp fft_width,
const int_tp ker_center_h, const int_tp ker_center_w,
const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp h_out = gId / width_out;
int_tp w_out = gId - (h_out * width_out);
int_tp h = h_out * stride_h + ker_center_h;
int_tp w = w_out * stride_w + ker_center_w;
int_tp src_idx = h*fft_width + w;
map_out[gId] = map_in[src_idx];
}
/* Use when width_out < 4 */
__kernel void TEMPLATE(copy2buffer_left_top_out_naive_2d,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp size, const int_tp count, const int_tp map_in_size,
const int_tp height_out, const int_tp width_out,
const int_tp fft_height, const int_tp fft_width,
const int_tp ker_center_h, const int_tp ker_center_w,
const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp h_out = gId / width_out;
int_tp w_out = gId - (h_out * width_out);
int_tp h = h_out * stride_h + ker_center_h;
int_tp w = w_out * stride_w + ker_center_w;
int_tp src_idx = out * map_in_size + h*fft_width + w;
int_tp dst_idx = out * size + gId;
map_out[dst_idx] = map_in[src_idx];
}
/* Use when width_out >= 4 */
__kernel void TEMPLATE(copy2buffer_left_top_out,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp size,
const int_tp height_out, const int_tp width_out,
const int_tp fft_height, const int_tp fft_width,
const int_tp ker_c_h, const int_tp ker_c_w,
const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp count = size >> 2;
int_tp gId4 = gId << 2;
int_tp h_out = gId4 / width_out;
int_tp w_out = gId4 - (h_out * width_out);
int_tp h = h_out * stride_h + ker_c_h;
int_tp w = w_out * stride_w + ker_c_w;
int_tp src_idx = h*fft_width + w;
if (gId < count) {
Dtype4 map_in_cache4;
int_tp has_pad = width_out - (w - pad_w);
if (has_pad >= 4) {
map_in_cache4 = vload4(src_idx >> 2, map_in);
} else {
int_tp right_elements = fft_width - width_out;
if (0 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.x = map_in[src_idx];
if (1 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.y = map_in[src_idx+1];
if (2 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.z = map_in[src_idx+2];
if (3 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.w = map_in[src_idx+3];
}
vstore4(map_in_cache4, gId, map_out);
} else if (gId == count) {
int_tp res = size - (count << 2); /* size % 4 */
if (res > 0) {
for (int_tp i = gId4; i < size; ++i) {
map_out[i] = map_in[src_idx];
src_idx++;
}
}
}
}
/* Use when width_out >= 4 */
__kernel void TEMPLATE(copy2buffer_left_top_out_2d,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp size, const int_tp count, const int_tp map_in_size,
const int_tp height_out, const int_tp width_out,
const int_tp fft_height, const int_tp fft_width,
const int_tp ker_c_h, const int_tp ker_c_w,
const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp gId4 = gId << 2;
int_tp h_out = gId4 / width_out;
int_tp w_out = gId4 - (h_out * width_out);
int_tp h = h_out * stride_h + ker_c_h;
int_tp w = w_out * stride_w + ker_c_w;
int_tp src_idx = h*fft_width + w;
const __global Dtype* map_in_2d = map_in + out * map_in_size;
__global Dtype* map_out_2d = map_out + out * size;
if (gId < count) {
Dtype4 map_in_cache4;
int_tp has_pad = width_out - (w - pad_w);
if (has_pad >= 4) {
map_in_cache4 = vload4(src_idx >> 2, map_in_2d);
} else {
int_tp right_elements = fft_width - width_out;
if (0 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.x = map_in_2d[src_idx];
if (1 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.y = map_in_2d[src_idx+1];
if (2 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.z = map_in_2d[src_idx+2];
if (3 == has_pad) {
src_idx += right_elements;
}
map_in_cache4.w = map_in_2d[src_idx+3];
}
vstore4(map_in_cache4, gId, map_out_2d);
} else if (gId == count) {
int_tp res = size - (count << 2); /* size % 4 */
if (res > 0) {
const __global Dtype4* map_in_2d_4 =
(const __global Dtype4*)(map_in_2d + src_idx);
__global Dtype4* map_out_2d_4 = (__global Dtype4*)(map_out_2d + gId4);
if (res == 3) {
map_out_2d_4[0].xyz = map_in_2d_4[0].xyz;
} else if (res == 2) {
map_out_2d_4[0].xy = map_in_2d_4[0].xy;
} else if (res == 1) {
map_out_2d_4[0].x = map_in_2d_4[0].x;
}
}
}
}
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp width_out,
const int_tp fft_height, const int_tp fft_width,
const int_tp ker_center_h, const int_tp ker_center_w,
const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp h_out = gId / width_out;
int_tp w_out = gId - (h_out * width_out);
int_tp h = h_out * stride_h + pad_h;
int_tp w = w_out * stride_w + pad_w;
int_tp ky = h - ker_center_h;
if (ky < 0) ky += fft_height;
int_tp kx = w - ker_center_w;
if (kx < 0) kx += fft_width;
int_tp src_idx = ky*fft_width + kx;
map_out[gId] = map_in[src_idx];
}
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out_2d,Dtype)(__global Dtype* map_out,
const int_tp offset_map_out,
const __global Dtype* map_in, const int_tp offset_map_in,
const int_tp map_out_size, const int_tp map_in_size,
const int_tp width_out,
const int_tp fft_height, const int_tp fft_width,
const int_tp ker_center_h, const int_tp ker_center_w,
const int_tp stride_h, const int_tp stride_w,
const int_tp pad_h, const int_tp pad_w) {
map_out += offset_map_out;
map_in += offset_map_in;
int_tp gId = get_global_id(0);
int_tp gId_y = get_global_id(1);
int_tp h_out = gId / width_out;
int_tp w_out = gId - (h_out * width_out);
int_tp h = h_out * stride_h + pad_h;
int_tp w = w_out * stride_w + pad_w;
int_tp ky = h - ker_center_h;
if (ky < 0) ky += fft_height;
int_tp kx = w - ker_center_w;
if (kx < 0) kx += fft_width;
int_tp src_idx = gId_y * map_in_size + ky*fft_width + kx;
int_tp dst_idx = gId_y * map_out_size + gId;
map_out[dst_idx] = map_in[src_idx];
}
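/* Note on the two cyclic-shift kernels above: they recentre the circularly shifted
 * result of an FFT-based convolution. For each output pixel the source coordinate is
 * (h - ker_center_h, w - ker_center_w) wrapped modulo the FFT buffer size, e.g. with
 * fft_width = 8, ker_center_w = 2 and w = 1 the read column is 1 - 2 + 8 = 7. */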
__kernel void TEMPLATE(complex_conjugate_multiplication_1d,Dtype)(__global Dtype* dst,
const int_tp offset_dst,
const __global Dtype* src1, const int_tp offset_src1,
const __global Dtype* src2, const int_tp offset_src2,
const int_tp ch_gr) {
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp gId = get_global_id(0);
int_tp size = get_global_size(0);
Dtype4 dst_cache = 0.f;
int_tp src_idx;
Dtype4 s1_cache;
Dtype4 s2_cache;
for (int_tp c = 0; c < ch_gr; ++c) {
src_idx = size * c + gId;
s1_cache = vload4(src_idx, src1);
s2_cache = vload4(src_idx, src2);
dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y;
dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x;
dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w;
dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z;
}
((__global Dtype4*)(&dst[gId<<2]))[0] += dst_cache;
}
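/* Worked equation for the accumulation above: each Dtype4 holds two interleaved
 * complex values (re, im, re, im). Per channel the kernel adds s1 * conj(s2):
 *   re += s1.re * s2.re + s1.im * s2.im
 *   im += s1.im * s2.re - s1.re * s2.im
 * i.e. the frequency-domain form of correlating src1 with src2. */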
__kernel void TEMPLATE(complex_conjugate_multiplication_2d,Dtype)(__global Dtype* dst,
const int_tp offset_dst,
const __global Dtype* src1, const int_tp offset_src1,
const __global Dtype* src2, const int_tp offset_src2,
const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) {
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp src1_idx, src2_idx;
int_tp dst_map_offset = map_size * out;
int_tp dst_idx = dst_map_offset + gId;
Dtype4 s1_cache, s2_cache;
Dtype4 dst_cache = 0.f;
int_tp map_offset = dst_map_offset * ch_gr;
for (int_tp i = 0; i < ch_gr; ++i) {
src1_idx = map_size * i + gId;
src2_idx = map_offset + src1_idx;
s1_cache = vload4(src1_idx, src1);
s2_cache = vload4(src2_idx, src2);
dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw);
dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz);
}
vstore4(dst_cache, dst_idx, dst);
}
__kernel void TEMPLATE(complex_conjugate_multiplication_2d_SLM,Dtype)(
__global Dtype* restrict dst, const int_tp offset_dst,
const __global Dtype* restrict src1, const int_tp offset_src1,
__local Dtype* local_src1,
const __global Dtype* restrict src2, const int_tp offset_src2,
const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) {
int_tp gId = get_global_id(0);
if (gId >= map_size) return; /* Do not remove this */
int_tp out = get_global_id(1);
if (out >= out_gr) return; /* Do not remove this */
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp tId = get_local_id(0);
int_tp local_out = get_local_id(1);
int_tp tile_size = get_local_size(0);
Dtype4 s1_cache;
if (local_out == 0) {
for (int_tp c = 0; c < ch_gr; ++c) {
s1_cache = vload4(map_size * c + gId, src1);
vstore4(s1_cache, tile_size * c + tId, local_src1);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int_tp dst_map_offset = map_size * out;
int_tp dst_idx = (dst_map_offset + gId) << 2;
Dtype4 dst_cache = 0.f;
Dtype4 s2_cache;
int_tp ch_offset = 0;
int_tp map_offset = dst_map_offset * ch_gr;
for (int_tp c = 0; c < ch_gr; ++c) {
ch_offset = map_size * c;
s1_cache = vload4(tile_size * c + tId, local_src1);
s2_cache = vload4(map_offset + ch_offset + gId, src2);
dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw);
dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz);
}
((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache;
}
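/* Note on the _SLM variant above: the rows with local_out == 0 stage this tile's src1
 * quads into local memory, and after the barrier every output row of the work-group
 * reads them from local_src1 instead of re-fetching src1 from global memory for each
 * of the out_gr output maps. */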
__kernel void TEMPLATE(complex_conjugate_multiplication_3d,Dtype)(__global Dtype* dst,
const int_tp offset_dst,
const __global Dtype* src1, const int_tp offset_src1,
const __global Dtype* src2, const int_tp offset_src2,
const int_tp out_gr, const int_tp size, const int_tp ch_gr) {
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp ch = get_global_id(2);
Dtype4 dst_cache = 0.f;
Dtype4 s1_cache = ((__global Dtype4*)(&(src1[(size*ch+gId)<<2])))[0];
Dtype4 s2_cache = ((__global Dtype4*)(&(src2[(size*(out*ch_gr+ch)+gId)<<2])))[0];
dst_cache.x = s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y;
dst_cache.y = -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x;
dst_cache.z = s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w;
dst_cache.w = -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z;
((__global Dtype4*)(&dst[(size*out+gId)<<2]))[0] += dst_cache;
}
__kernel void TEMPLATE(complex_conjugate_multiplication_3d_SLM,Dtype)(__global Dtype* dst,
const int_tp offset_dst, __local Dtype* local_dst,
const __global Dtype* src1, const int_tp offset_src1,
__local Dtype* local_src1, const __global Dtype* src2,
const int_tp offset_src2, const int_tp out_gr, const int_tp map_size,
const int_tp ch_gr) {
int_tp gId = get_global_id(0);
if (gId >= map_size) return; /* Do not remove this */
int_tp out = get_global_id(1);
if (out >= out_gr) return; /* Do not remove this */
int_tp ch = get_global_id(2);
if (ch >= ch_gr) return; /* Do not remove this */
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp tId = get_local_id(0);
int_tp local_out = get_local_id(1);
int_tp tile_size = get_local_size(0);
Dtype4 s1_cache;
if (local_out == 0) {
s1_cache = vload4(map_size * ch + gId, src1);
vstore4(s1_cache, tile_size * ch + tId, local_src1);
}
barrier(CLK_LOCAL_MEM_FENCE);
int_tp dst_map_offset = map_size * out;
int_tp dst_idx = (dst_map_offset + gId) << 2;
Dtype4 dst_cache = 0.f;
Dtype4 s2_cache;
s1_cache = vload4(tile_size * ch + tId, local_src1);
s2_cache = vload4((dst_map_offset * ch_gr) + (map_size * ch) + gId, src2);
dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y;
dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x;
dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w;
dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z;
((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache;
}
__kernel void TEMPLATE(complex_multiplication_1d,Dtype)(__global Dtype* dst,
const int_tp offset_dst,
const __global Dtype* src1, const int_tp offset_src1,
const __global Dtype* src2, const int_tp offset_src2,
const int_tp size, const int_tp ch_gr) {
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp gId = get_global_id(0);
Dtype4 s2_cache;
Dtype4 dst_cache = 0.f;
int_tp idx_with_ch;
Dtype4 s1_cache = vload4(gId, src1);
for (int_tp ch = 0; ch < ch_gr; ++ch) {
idx_with_ch = size * ch + gId;
s2_cache = vload4(idx_with_ch, src2);
dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw;
dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz;
((__global Dtype4*)(&dst[idx_with_ch<<2]))[0] += dst_cache;
}
}
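/* Worked equation: unlike the *_conjugate_* kernels this one accumulates the plain
 * complex product s1 * s2 per channel:
 *   re = s1.re * s2.re - s1.im * s2.im
 *   im = s1.re * s2.im + s1.im * s2.re */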
__kernel void TEMPLATE(complex_multiplication_2d_SLM,Dtype)(__global Dtype* restrict dst,
const int_tp offset_dst, __local Dtype* local_dst,
const __global Dtype* restrict src1, const int_tp offset_src1,
const __global Dtype* restrict src2, const int_tp offset_src2,
const int_tp num_output, const int_tp size, const int_tp ch_gr) {
int_tp gId = get_global_id(0);
if (gId >= size) return;
int_tp out = get_global_id(1);
if (out >= num_output) return;
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp tId = get_local_id(0);
int_tp tOut = get_local_id(1);
int_tp tile_size = get_local_size(0);
int_tp local_out_size = get_local_size(1);
int_tp out_offset = out * size;
int_tp out_ch_offset = out_offset * ch_gr;
int_tp tile_size_in_all_ch = tile_size * ch_gr;
int_tp local_out_ch_offset = tOut * tile_size_in_all_ch;
int_tp src2_idx, local_dst_idx;
Dtype4 s2_cache, dst_cache;
int_tp src1_idx = out_offset + gId;
Dtype4 s1_cache = vload4(src1_idx, src1);
for (int_tp ch = 0; ch < ch_gr; ++ch) {
src2_idx = out_ch_offset + ch * size + gId;
s2_cache = vload4(src2_idx, src2);
dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw;
dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz;
local_dst_idx = local_out_ch_offset + ch * tile_size + tId;
vstore4(dst_cache, local_dst_idx, local_dst);
}
barrier(CLK_LOCAL_MEM_FENCE);
int_tp start_idx, half_start_idx;
int_tp ch_offset;
int_tp this_idx, that_idx;
for (int_tp offset = local_out_size >>= 1; offset > 0; offset >>=1) {
if (tOut < offset) {
start_idx = tOut * tile_size_in_all_ch + tId;
half_start_idx = (tOut + offset) * tile_size_in_all_ch + tId;
for (int_tp ch = 0; ch < ch_gr; ++ch) {
ch_offset = ch * tile_size;
this_idx = (start_idx + ch_offset) << 2;
that_idx = (half_start_idx + ch_offset) << 2;
((__local Dtype4*)(&local_dst[this_idx]))[0] +=
((__local Dtype4*)(&local_dst[that_idx]))[0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tOut == 0) {
for (int_tp ch = 0; ch < ch_gr; ++ch) {
dst_cache = vload4(tile_size * ch + tId, local_dst);
((__global Dtype4*)(&dst[(size * ch + gId)<<2]))[0] += dst_cache;
}
}
}
__kernel void TEMPLATE(complex_multiplication_3d,Dtype)(__global Dtype* dst,
const int_tp offset_dst,
const __global Dtype* src1, const int_tp offset_src1,
const __global Dtype* src2, const int_tp offset_src2,
const int_tp size, const int_tp ch_gr, const int_tp out_gr, const int_tp num_output) {
dst += offset_dst;
src1 += offset_src1;
src2 += offset_src2;
int_tp gId = get_global_id(0);
int_tp ch = get_global_id(1);
int_tp out = get_global_id(2);
int_tp g = out / out_gr;
ch += (g * ch_gr);
int_tp c_offset = ch - ((ch / ch_gr) * ch_gr);
__global Dtype2* dst_ch = ((__global Dtype2*)(dst)) + (size * ch);
__global Dtype2* src1_out = ((__global Dtype2*)(src1)) + (size * out);
__global Dtype2* src2_out_ch = ((__global Dtype2*)(src2)) + (size * (out * ch_gr + c_offset));
Dtype2 s1_cache = src1_out[gId];
Dtype2 s2_cache = src2_out_ch[gId];
Dtype2 dst_cache = 0.f;
dst_cache.x = s1_cache.x * s2_cache.x - s1_cache.y * s2_cache.y;
dst_cache.y = s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x;
dst_ch[gId] += dst_cache;
}
/* Convert [RRRR...GGGG...BBBB...] to [RGBRGBRGBRGB...] */
/* Reshape 2 */
__kernel void TEMPLATE(convert_data_to_channel_major,Dtype)(__global Dtype2* dst,
const __global Dtype2* src, const int_tp size, const int_tp ch_gr) {
int_tp gId = get_global_id(0);
__global Dtype* dst_ptr = (__global Dtype*)(dst + (gId * ch_gr));
const __global Dtype* src_ptr = (const __global Dtype*)(src + gId);
Dtype2 s;
int_tp src_idx = 0;
for (int_tp i = 0; i < ch_gr; ++i) {
s = vload2(src_idx, src_ptr);
vstore2(s, i, dst_ptr);
src_idx += size;
}
}
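/* Layout example for the conversion above: treating each Dtype2 as one complex bin,
 * channel-planar data becomes channel-interleaved. With size = 4 and ch_gr = 3
 * ([RRRR GGGG BBBB] -> [RGB RGB RGB RGB]), work-item gId = 1 reads src[1], src[5],
 * src[9] and writes them to dst[3], dst[4], dst[5]. */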
/* Reshape 1 */
/*__kernel void TEMPLATE(convert_data_to_channel_major(__global Dtype4* dst,
const __global Dtype4* src, const int_tp size, const int_tp ch_gr) {
int_tp gId = get_global_id(0);
const __global Dtype4* src_ptr4 = src + gId;
__global Dtype4* dst_ptr4 = dst + (gId * ch_gr);
for (int_tp i = 0; i < ch_gr; ++i) {
dst_ptr4[i] = src_ptr4[i*size];
}
}
*/
/* Convert multiple [RRRR...GGGG...BBBB...] to multiple [RGBRGBRGBRGB...] */
/* Reshape 2 */
__kernel void TEMPLATE(convert_weight_to_channel_major,Dtype)(__global Dtype2* dst,
const __global Dtype2* src, const int_tp size, const int_tp ch_gr,
const int_tp num_output) {
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp out_offset = out * (size * ch_gr);
__global Dtype* dst_ptr = (__global Dtype*)(dst + out_offset + (gId * ch_gr));
const __global Dtype* src_ptr =
(const __global Dtype*)(src + out_offset + gId);
Dtype2 s;
int_tp src_idx = 0;
for (int_tp i = 0; i < ch_gr; ++i) {
s = vload2(src_idx, src_ptr);
vstore2(s, i, dst_ptr);
src_idx += size;
}
}
/* Reshape 1 */
/*
__kernel void TEMPLATE(convert_weight_to_channel_major(__global Dtype4* dst,
const __global Dtype4* src, const int_tp size, const int_tp ch_gr,
const int_tp out_gr) {
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp out_offset = out * (size * ch_gr);
__global Dtype4* dst_ptr4 = dst + out_offset + (gId * ch_gr);
const __global Dtype4* src_ptr4 = src + out_offset + gId;
for (int_tp i = 0; i < ch_gr; ++i) {
dst_ptr4[i] = src_ptr4[size * i];
}
}
*/
/* Cdotc per element */
/* Reshape 1 */
/*
__kernel void TEMPLATE(batchedCdotc(__global Dtype4* dst,
const __global Dtype4* src1, const __global Dtype4* src2,
const int_tp size, const int_tp ch_gr, const int_tp out_gr) {
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp ch_offset = gId * ch_gr;
int_tp out_offset = out * size;
const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset);
const __global Dtype* src2_ptr = (const __global Dtype*)(src2 + (out_offset * ch_gr) + ch_offset);
Dtype4 cdotc = 0.f;
Dtype4 s1, s2;
for (int_tp c = 0; c < ch_gr; ++c) {
s1 = vload4(c, src1_ptr);
s2 = vload4(c, src2_ptr);
cdotc.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw);
cdotc.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz);
}
__global Dtype4* dst_ptr4 = dst + out_offset + gId;
dst_ptr4[0] += cdotc;
}
*/
/* Cdotc per two elements */
/* Reshape 2 */
__kernel void TEMPLATE(batchedCdotc,Dtype)(__global Dtype2* dst,
const __global Dtype2* src1, const __global Dtype2* src2,
const int_tp size, const int_tp ch_gr, const int_tp out_gr) {
int_tp gId = get_global_id(0);
int_tp out = get_global_id(1);
int_tp ch_offset = gId * ch_gr;
const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset);
const __global Dtype* src2_ptr =
(const __global Dtype*)(src2 + (out * size * ch_gr) + ch_offset);
Dtype4 cdotc4 = 0.f;
Dtype2 cdotc = 0.f;
Dtype4 s1, s2;
int_tp n = ch_gr >> 1;
int_tp r = ch_gr - (n << 1);
for (int_tp i = 0; i < n; ++i) {
s1 = vload4(i, src1_ptr);
s2 = vload4(i, src2_ptr);
cdotc4.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw);
cdotc4.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz);
}
cdotc.x += dot(cdotc4.xz, (float2)(1));
cdotc.y += dot(cdotc4.yw, (float2)(1));
if (r == 1) {
const __global Dtype* src1_ptr2 =
(const __global Dtype*)(((const __global Dtype4*)(src1_ptr)) + n);
const __global Dtype* src2_ptr2 =
(const __global Dtype*)(((const __global Dtype4*)(src2_ptr)) + n);
Dtype2 t1 = vload2(0, src1_ptr2);
Dtype2 t2 = vload2(0, src2_ptr2);
cdotc.x += mad( t1.x, t2.x, t1.y * t2.y);
cdotc.y += mad(-t1.x, t2.y, t1.y * t2.x);
}
__global Dtype* dst_ptr = (__global Dtype*)(dst + (out * size) + gId);
vstore2(cdotc, 0, dst_ptr);
}
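/* Note on batchedCdotc above: for every output map and frequency bin it reduces over
 * the channels of the group, accumulating sum_c src1[c] * conj(src2[out][c]) two
 * channels at a time through Dtype4, with a Dtype2 tail when ch_gr is odd; the
 * dot(..., (float2)(1)) calls collapse the two partial sums into one complex value. */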
#endif
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x,
const int_tp offx) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
x[index + offx] = alpha;
}
}
__kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x,
const int_tp offx) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
x[index + offx] = alpha;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(im2col,Dtype)(const int_tp n,
__global const Dtype* data_im,
const int_tp data_im_off,
const int_tp height, const int_tp width,
const int_tp kernel_h,
const int_tp kernel_w, const int_tp pad_h,
const int_tp pad_w, const int_tp stride_h,
const int_tp stride_w,
const int_tp dilation_h,
const int_tp dilation_w,
const int_tp height_col,
const int_tp width_col,
__global Dtype* data_col,
const int_tp data_col_off) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp h_index = index / width_col;
const int_tp h_col = h_index % height_col;
const int_tp w_col = index % width_col;
const int_tp c_im = h_index / height_col;
const int_tp c_col = c_im * kernel_h * kernel_w;
const int_tp h_offset = h_col * stride_h - pad_h;
const int_tp w_offset = w_col * stride_w - pad_w;
__global Dtype* data_col_ptr = data_col + data_col_off;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
__global const Dtype* data_im_ptr = data_im + data_im_off;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int_tp i = 0; i < kernel_h; ++i) {
for (int_tp j = 0; j < kernel_w; ++j) {
int_tp h_im = h_offset + i * dilation_h;
int_tp w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
data_col_ptr += height_col * width_col;
}
}
}
}
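/* Index example for im2col above: every output location (h_col, w_col) of channel
 * c_im gets a kernel_h x kernel_w patch written into its column. With a 3x3 kernel,
 * stride 1, pad 1 and dilation 1, the entry (i, j) = (0, 0) for output (0, 0) maps to
 * (h_im, w_im) = (-1, -1), which lies in the padding and is therefore written as 0. */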
__kernel void TEMPLATE(col2im,Dtype)(const int_tp n,
__global const Dtype* data_col,
const int_tp data_col_off,
const int_tp height, const int_tp width,
const int_tp channels,
const int_tp kernel_h,
const int_tp kernel_w, const int_tp pad_h,
const int_tp pad_w, const int_tp stride_h,
const int_tp stride_w,
const int_tp dilation_h,
const int_tp dilation_w,
const int_tp height_col,
const int_tp width_col,
__global Dtype* data_im,
const int_tp data_im_off) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype val = 0;
const int_tp w_im = index % width + pad_w;
const int_tp h_im = (index / width) % height + pad_h;
const int_tp c_im = index / (width * height);
int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
const int_tp w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int_tp w_col_end = min(w_im / stride_w + 1, width_col);
const int_tp h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int_tp h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int_tp h_k = (h_im - h_col * stride_h);
int_tp w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_off + data_col_index];
}
}
}
data_im[data_im_off + index] = val;
}
}
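/* Note on col2im above: it is the adjoint of im2col. Each image pixel sums every
 * column entry it was copied into, i.e. all (h_col, w_col, h_k, w_k) satisfying
 *   h_im = h_col * stride_h + h_k * dilation_h,  w_im = w_col * stride_w + w_k * dilation_w
 * (h_im and w_im already include the padding offset), which is why only offsets
 * divisible by the dilation contribute. */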
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes,
const int_tp channel_axis,
__global const Dtype* data_im,
const int_tp data_im_off,
__global const int_tp* im_shape,
__global const int_tp* col_shape,
__global const int_tp* kernel_shape,
__global const int_tp* pad,
__global const int_tp* stride,
__global const int_tp* dilation,
__global Dtype* data_col,
const int_tp data_col_off) {
int_tp d_temp[6];
int_tp d_iter[6];
int_tp i;
__global const int_tp* im_shape_ptr = im_shape + channel_axis;
__global const int_tp* col_shape_ptr = col_shape + channel_axis;
__local int_tp shared_dilation[6];
__local int_tp shared_kernel_shape[6];
__local int_tp shared_pad[6];
__local int_tp shared_stride[6];
__local int_tp shared_col_shape[6 + 1];
__local int_tp shared_im_shape[6 + 1];
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) {
shared_dilation[li] = dilation[li];
shared_kernel_shape[li] = kernel_shape[li];
shared_pad[li] = pad[li];
shared_stride[li] = stride[li];
}
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) {
shared_col_shape[li] = col_shape_ptr[li];
shared_im_shape[li] = im_shape_ptr[li];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
    // channel_in starts as the flat index; the loop below peels off the spatial
    // column indices into d_temp, leaving the input channel in channel_in.
int_tp channel_in = index;
int_tp channel_out = 1;
for (i = num_axes - 1; i >= 0; --i) {
d_temp[i] = channel_in % shared_col_shape[i + 1];
channel_in /= shared_col_shape[i + 1];
channel_out *= shared_kernel_shape[i];
}
channel_out *= channel_in;
int_tp data_col_inc = 1;
for (i = 0; i < num_axes; ++i) {
channel_out *= shared_col_shape[i + 1];
channel_out += d_temp[i];
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i];
channel_in *= shared_im_shape[i + 1];
channel_in += d_temp[i];
data_col_inc *= shared_col_shape[i + 1];
d_iter[i] = 0;
}
__global Dtype* data_col_ptr = data_col + data_col_off + channel_out;
__global const Dtype* data_im_ptr = data_im + data_im_off + channel_in;
bool incremented;
do {
bool in_range = true;
for (i = 0; i < num_axes; ++i) {
const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i];
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1];
if (!in_range) {
break;
}
}
if (in_range) {
int_tp data_im_offset = d_iter[0] * shared_dilation[0];
for (i = 1; i < num_axes; ++i) {
data_im_offset *= shared_im_shape[i + 1];
data_im_offset += d_iter[i] * shared_dilation[i];
}
*data_col_ptr = data_im_ptr[data_im_offset];
} else {
*data_col_ptr = 0;
}
data_col_ptr += data_col_inc;
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
const int_tp d_max = shared_kernel_shape[i];
if (d_iter[i] == d_max - 1) {
d_iter[i] = 0;
} else { // d_iter[i] < d_max - 1
++d_iter[i];
incremented = true;
break;
}
} // for (int_tp i = num_axes - 1; i >= 0; --i)
} while (incremented); // do
}
}
__kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes,
const int_tp channel_axis,
__global const Dtype* data_col,
const int_tp data_col_off,
__global const int_tp* im_shape,
__global const int_tp* col_shape,
__global const int_tp* kernel_shape,
__global const int_tp* pad,
__global const int_tp* stride,
__global const int_tp* dilation,
__global Dtype* data_im,
const int_tp data_im_off) {
int_tp d_im[6];
int_tp d_col_iter[6];
int_tp d_col_start[6];
int_tp d_col_end[6];
__global const int_tp* im_shape_ptr = im_shape + channel_axis;
__global const int_tp* col_shape_ptr = col_shape + channel_axis;
__local int_tp shared_dilation[6];
__local int_tp shared_kernel_shape[6];
__local int_tp shared_pad[6];
__local int_tp shared_stride[6];
__local int_tp shared_col_shape[6 + 1];
__local int_tp shared_im_shape[6 + 1];
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) {
shared_dilation[li] = dilation[li];
shared_kernel_shape[li] = kernel_shape[li];
shared_pad[li] = pad[li];
shared_stride[li] = stride[li];
}
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) {
shared_col_shape[li] = col_shape_ptr[li];
shared_im_shape[li] = im_shape_ptr[li];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    // c_im starts as the flat index and is reduced to the channel in the loops below.
int_tp c_im = index;
// Calculate d_im (image dimensions).
for (int_tp i = num_axes - 1; i >= 0; --i) {
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i];
c_im /= shared_im_shape[i + 1];
}
// Calculate col start/end indices.
bool done = false;
for (int_tp i = 0; i < num_axes; ++i) {
const int_tp kernel_extent = shared_dilation[i]
* (shared_kernel_shape[i] - 1) + 1;
d_col_start[i] = d_col_iter[i] =
(d_im[i] < kernel_extent) ?
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1;
d_col_end[i] = min(d_im[i] / shared_stride[i] + 1,
shared_col_shape[i + 1]);
if (d_col_start[i] >= d_col_end[i]) {
// Skip computation if the dimension is 0 at any spatial axis --
// final val will be 0.
data_im[index] = 0;
done = true;
break; // for (int_tp i = 0; i < num_axes; ++i)
}
}
if (!done) {
// Loop over the col to compute the output val.
Dtype val = 0;
bool incremented = true;
bool skip = false;
do {
// Compute the final offset.
int_tp final_offset = 0;
int_tp kernel_shape_prod = 1;
int_tp kernel_index;
for (int_tp i = num_axes - 1; i >= 0; --i) {
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i];
if (kernel_index % shared_dilation[i]) {
skip = true;
break;
} else {
kernel_index /= shared_dilation[i];
final_offset += kernel_index * kernel_shape_prod;
kernel_shape_prod *= shared_kernel_shape[i];
}
}
if (!skip) {
final_offset += kernel_shape_prod * c_im;
for (int_tp i = 0; i < num_axes; ++i) {
final_offset *= shared_col_shape[i + 1];
final_offset += d_col_iter[i];
}
val += data_col[data_col_off + final_offset];
}
skip = false;
incremented = false;
for (int_tp i = num_axes - 1; i >= 0; --i) {
const int_tp d_max = d_col_end[i];
if (d_col_iter[i] == d_max - 1) {
d_col_iter[i] = d_col_start[i];
} else { // d_col_iter[i] < d_max - 1
++d_col_iter[i];
incremented = true;
break; // for (int_tp i = num_axes - 1; i >= 0; --i)
}
} // for (int_tp i = num_axes - 1; i >= 0; --i)
} while (incremented);
data_im[data_im_off + index] = val;
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads,
__global const Dtype* in,
__global const Dtype* scale,
const Dtype negative_beta,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
out[index] = in[index] * pow(scale[index], negative_beta);
}
}
__kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in,
const int_tp num, const int_tp channels,
const int_tp height, const int_tp width, const int_tp size,
const Dtype alpha_over_size, const Dtype k,
__global Dtype* const scale) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp n = index / width / height;
const int_tp offset = (n * channels * height + h) * width + w;
const int_tp step = height * width;
__global const Dtype* in_off = in + offset;
__global Dtype* scale_off = scale + offset;
int_tp head = 0;
const int_tp pre_pad = (size - 1) / 2;
const int_tp post_pad = size - pre_pad - 1;
Dtype accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
accum_scale += in_off[head * step] * in_off[head * step];
++head;
}
// both add and subtract
while (head < channels) {
accum_scale += in_off[head * step] * in_off[head * step];
if (head - size >= 0) {
accum_scale -= in_off[(head - size) * step]
* in_off[(head - size) * step];
}
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
// subtract only
while (head < channels + post_pad) {
if (head - size >= 0) {
accum_scale -= in_off[(head - size) * step]
* in_off[(head - size) * step];
}
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
}
}
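/* Worked equation for the two LRN kernels above: for every spatial position a running
 * window over channels fills
 *   scale[n, c, h, w] = k + alpha_over_size * sum_{c'} in[n, c', h, w]^2
 * with c' ranging over the `size` channels around c, and the forward response is then
 *   out = in * pow(scale, negative_beta). */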
__kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads,
__global const Dtype* bottom_data,
__global const Dtype* top_data,
__global const Dtype* scale,
__global const Dtype* top_diff, const int_tp num,
const int_tp channels, const int_tp height,
const int_tp width, const int_tp size,
const Dtype negative_beta,
const Dtype cache_ratio,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp n = index / width / height;
const int_tp offset = (n * channels * height + h) * width + w;
const int_tp step = height * width;
__global const Dtype* bottom_off = bottom_data + offset;
__global const Dtype* top_off = top_data + offset;
__global const Dtype* scale_off = scale + offset;
__global const Dtype* top_diff_off = top_diff + offset;
__global Dtype* bottom_diff_off = bottom_diff + offset;
int_tp head = 0;
const int_tp pre_pad = size - (size + 1) / 2;
const int_tp post_pad = size - pre_pad - 1;
Dtype accum_ratio = 0;
// accumulate values
while (head < post_pad && head < channels) {
accum_ratio += top_diff_off[head * step] * top_off[head * step]
/ scale_off[head * step];
++head;
}
// both add and subtract
while (head < channels) {
accum_ratio += top_diff_off[head * step] * top_off[head * step]
/ scale_off[head * step];
if (head - size >= 0) {
accum_ratio -= top_diff_off[(head - size) * step]
* top_off[(head - size) * step] / scale_off[(head - size) * step];
}
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad)
* step] * pow(scale_off[(head - post_pad) * step], negative_beta)
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
++head;
}
// subtract only
while (head < channels + post_pad) {
if (head - size >= 0) {
accum_ratio -= top_diff_off[(head - size) * step]
* top_off[(head - size) * step] / scale_off[(head - size) * step];
}
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad)
* step] * pow(scale_off[(head - post_pad) * step], negative_beta)
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
++head;
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) {
return (Dtype)1 / ((Dtype)1 + exp(-x));
}
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) {
return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1;
}
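/* Identity used above: tanh(x) = 2 * sigmoid(2x) - 1, so both activations share the
 * same exp()-based helper. */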
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* X, __global Dtype* X_acts) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp x_dim = 4 * dim;
const int_tp d = index % x_dim;
if (d < 3 * dim) {
X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]);
} else {
X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]);
}
}
}
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont,
__global Dtype* C, __global Dtype* H) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp n = index / dim;
const int_tp d = index % dim;
__global const Dtype* X_offset = X + 4 * dim * n;
const Dtype i = X_offset[d];
const Dtype f = X_offset[1 * dim + d];
const Dtype o = X_offset[2 * dim + d];
const Dtype g = X_offset[3 * dim + d];
const Dtype c_prev = C_prev[index];
const Dtype c = cont[n] * f * c_prev + i * g;
C[index] = c;
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c);
H[index] = o * tanh_c;
}
}
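/* Cell update implemented above, with X packing the already-activated gates
 * [i, f, o, g] for every unit d:
 *   c_t = cont * f * c_{t-1} + i * g
 *   h_t = o * tanh(c_t)
 * cont[n] acts as a sequence-continuation indicator, so a value of 0 drops the
 * carried cell state at the start of a new sequence. */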
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H,
__global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff,
__global Dtype* C_prev_diff, __global Dtype* X_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp n = index / dim;
const int_tp d = index % dim;
__global const Dtype* X_offset = X + 4 * dim * n;
const Dtype i = X_offset[d];
const Dtype f = X_offset[1 * dim + d];
const Dtype o = X_offset[2 * dim + d];
const Dtype g = X_offset[3 * dim + d];
const Dtype c_prev = C_prev[index];
const Dtype c = C[index];
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c);
__global Dtype* c_prev_diff = C_prev_diff + index;
__global Dtype* X_diff_offset = X_diff + 4 * dim * n;
__global Dtype* i_diff = X_diff_offset + d;
__global Dtype* f_diff = X_diff_offset + 1 * dim + d;
__global Dtype* o_diff = X_diff_offset + 2 * dim + d;
__global Dtype* g_diff = X_diff_offset + 3 * dim + d;
const Dtype c_term_diff =
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
const Dtype cont_n = cont[n];
*c_prev_diff = cont_n * c_term_diff * f;
*i_diff = c_term_diff * g;
*f_diff = cont_n * c_term_diff * c_prev;
*o_diff = H_diff[index] * tanh_c;
*g_diff = c_term_diff * i;
}
}
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp x_dim = 4 * dim;
const int_tp d = index % x_dim;
const Dtype X_act = X_acts[index];
if (d < 3 * dim) {
X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act);
} else {
X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act);
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa,
__global Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = a[index + offa] * b[index + offb];
}
}
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa,
__global Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = a[index + offa] / b[index + offb];
}
}
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha,
__global Dtype* Y,
const int_tp offY) {
for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) {
Y[offY + index] += alpha;
}
}
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global const Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = a[offa + index] + b[offb + index];
}
}
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global const Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = a[offa + index] - b[offb + index];
}
}
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = fabs((Dtype)(a[offa + index]));
}
}
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = exp(a[offa + index]);
}
}
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = log((Dtype)(a[offa + index]));
}
}
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, Dtype alpha,
__global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
if(alpha == 2.0) {
y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha);
} else {
y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha);
}
}
}
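/* Note on powx above: the alpha == 2.0 special case routes the base through fabs();
 * since x^2 == |x|^2 the result is unchanged, presumably to sidestep pow() issues with
 * negative bases on some OpenCL implementations. */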
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x,
const int_tp offx, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = (0.0 < x[index + offx])
- (x[index + offx] < 0.0);
}
}
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x,
const int_tp offx, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = signbit(x[index + offx]);
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads,
const int_tp dims,
__global const Dtype* bottom_a,
const int_tp forward_a,
__global const Dtype* bottom_b,
const int_tp forward_b,
__global Dtype* top,
const int_tp num,
const int_tp channels_a,
const int_tp channels_b,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
int_tp batch_id = index / ((channels_a + channels_b) * size_a);
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a)
/ (channels_a * size_a)) % 2;
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
if (bottom_id == 0) {
int_tp channel_id = (index / size_a) % channels_a;
int_tp aidx = batch_id * channels_a + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
top[index] = (forward_a == 1) ? bottom_a[aidx] : 0;
} else {
int_tp channel_id = (index / size_a) % channels_b;
int_tp bidx = (batch_id * channels_b + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
top[index] = (forward_b == 1) ? bottom_b[bidx] : 0;
}
}
}
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads,
const int_tp dims,
__global Dtype* bottom_a,
const int_tp backward_a,
__global Dtype* bottom_b,
const int_tp backward_b,
__global const Dtype* top,
const int_tp num,
const int_tp channels_a,
const int_tp channels_b,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp batch_id = index / ((channels_a + channels_b) * size_a);
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a)
/ (channels_a * size_a)) % 2;
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
if (bottom_id == 0) {
int_tp channel_id = (index / size_a) % channels_a;
int_tp aidx = batch_id * channels_a + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
bottom_a[aidx] = (backward_a == 1) ? top[index] : 0;
} else {
int_tp channel_id = (index / size_a) % channels_b;
int_tp bidx = (batch_id * channels_b + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
bottom_b[bidx] = (backward_b == 1) ? top[index] : 0;
}
}
}
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads,
const int_tp dims,
__global const Dtype* bottom_a,
const int_tp forward_a,
__global const Dtype* bottom_b,
const int_tp forward_b,
__global Dtype* top,
const int_tp num,
const int_tp channels,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp batch_id = index / (channels * size_a);
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
top[index] = 0;
int_tp channel_id = (index / size_a) % channels;
int_tp aidx = batch_id * channels + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index];
int_tp bidx = (batch_id * channels + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index];
}
}
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads,
const int_tp dims,
__global Dtype* bottom_a,
const int_tp backward_a,
__global Dtype* bottom_b,
const int_tp backward_b,
__global const Dtype* top,
const int_tp num,
const int_tp channels,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp batch_id = index / (channels * size_a);
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
int_tp channel_id = (index / size_a) % channels;
int_tp aidx = batch_id * channels + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
bottom_a[aidx] = backward_a ? top[index] : 0;
int_tp bidx = (batch_id * channels + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
bottom_b[bidx] = backward_b ? top[index] : 0;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(max_pool_forward,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h,
const int_tp pad_w,
__global Dtype* top_data,
const int use_mask, __global int_tp* mask, __global Dtype* top_mask) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
const int_tp hend = min(hstart + kernel_h, height);
const int_tp wend = min(wstart + kernel_w, width);
hstart = max(hstart, (int_tp)0);
wstart = max(wstart, (int_tp)0);
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
if (bottom_slice[h * width + w] > maxval) {
maxidx = h * width + w;
maxval = bottom_slice[maxidx];
}
}
}
top_data[index] = maxval;
if (use_mask == 1) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
}
}
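/* Window arithmetic used above: for output (n, c, ph, pw) the pooling window is
 *   hstart = ph * stride_h - pad_h,  hend = min(hstart + kernel_h, height)
 * (likewise for the width), clamped to the image; the argmax flat index
 * h * width + w is stored in mask or top_mask depending on use_mask. */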
__kernel void TEMPLATE(ave_pool_forward,Dtype)(
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h,
const int_tp pad_w, __global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
{
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + kernel_h, height + pad_h);
int_tp wend = min(wstart + kernel_w, width + pad_w);
const int_tp pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, (int_tp)0);
wstart = max(wstart, (int_tp)0);
hend = min(hend, height);
wend = min(wend, width);
Dtype aveval = 0;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
aveval += bottom_slice[h * width + w];
}
}
top_data[index] = aveval / pool_size;
}
}
}
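/* Note on ave_pool_forward above: pool_size is computed before the window is clipped
 * to the image, so positions that fall into the padding still count towards the
 * divisor of the average. */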
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w,
__global Dtype* rand_idx,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
const int_tp hstart = ph * stride_h;
const int_tp hend = min(hstart + kernel_h, height);
const int_tp wstart = pw * stride_w;
const int_tp wend = min(wstart + kernel_w, width);
Dtype cumsum = 0.;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w];
}
}
const float thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index.
cumsum = 0;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w];
if (cumsum >= thres) {
rand_idx[index] = ((n * channels + c) * height + h) * width + w;
top_data[index] = bottom_slice[h * width + w];
h = hend;
w = wend;
}
}
}
}
}
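/* Note on sto_pool_forward_train above: one activation is sampled per window with
 * probability proportional to its value; thres = rand_idx[index] * sum(x), the first
 * position whose running sum reaches thres is selected, and its flat index is written
 * back into rand_idx for use in the backward pass. */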
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
const int_tp hstart = ph * stride_h;
const int_tp hend = min(hstart + kernel_h, height);
const int_tp wstart = pw * stride_w;
const int_tp wend = min(wstart + kernel_w, width);
    // Seed cumsum with FLT_MIN to avoid divide-by-zero problems
Dtype cumsum = FLT_MIN;
Dtype cumvalues = 0.;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w];
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
}
}
top_data[index] = cumvalues / cumsum;
}
}
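/* Note on sto_pool_forward_test above: at test time stochastic pooling is replaced by
 * its expectation, a weighted average with weights proportional to the activations:
 *   out = sum(x^2) / sum(x)
 * cumsum starts at FLT_MIN so an all-zero window cannot divide by zero. */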
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int use_mask,
__global const int_tp* mask,
__global const Dtype* top_mask,
const int_tp num,
const int_tp channels,
const int_tp height,
const int_tp width,
const int_tp pooled_height,
const int_tp pooled_width,
const int_tp kernel_h,
const int_tp kernel_w,
const int_tp stride_h,
const int_tp stride_w,
const int_tp pad_h,
const int_tp pad_w,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local index
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp c = (index / width / height) % channels;
const int_tp n = index / width / height / channels;
const int_tp phstart =
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height);
const int_tp pwstart =
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width);
Dtype gradient = 0;
const int_tp offset = (n * channels + c) * pooled_height * pooled_width;
__global const Dtype* top_diff_slice = top_diff + offset;
if (use_mask == 1) {
__global const int_tp* mask_slice = mask + offset;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
if (mask_slice[ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_slice[ph * pooled_width + pw];
}
}
}
} else {
__global const Dtype* top_mask_slice = top_mask + offset;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_slice[ph * pooled_width + pw];
}
}
}
}
bottom_diff[index] = gradient;
}
}
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int_tp num,
const int_tp channels,
const int_tp height,
const int_tp width,
const int_tp pooled_height,
const int_tp pooled_width,
const int_tp kernel_h,
const int_tp kernel_w,
const int_tp stride_h,
const int_tp stride_w,
const int_tp pad_h,
const int_tp pad_w,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local index
// find out the local offset
const int_tp w = index % width + pad_w;
const int_tp h = (index / width) % height + pad_h;
const int_tp c = (index / width / height) % channels;
const int_tp n = index / width / height / channels;
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int_tp phend = min(h / stride_h + 1, pooled_height);
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int_tp pwend = min(w / stride_w + 1, pooled_width);
Dtype gradient = 0.0;
__global const Dtype* const top_diff_slice = top_diff
+ (n * channels + c) * pooled_height * pooled_width;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + kernel_h, height + pad_h);
int_tp wend = min(wstart + kernel_w, width + pad_w);
int_tp pool_size = (hend - hstart) * (wend - wstart);
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
__kernel void TEMPLATE(sto_pool_backward,Dtype)(
const int_tp nthreads, __global const Dtype* rand_idx,
__global const Dtype* const top_diff, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
const int_tp stride_w, __global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
// find out the local index
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp c = (index / width / height) % channels;
const int_tp n = index / width / height / channels;
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int_tp phend = min(h / stride_h + 1, pooled_height);
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int_tp pwend = min(w / stride_w + 1, pooled_width);
Dtype gradient = 0.0;
__global const Dtype* rand_idx_slice = rand_idx
+ (n * channels + c) * pooled_height * pooled_width;
__global const Dtype* top_diff_slice = top_diff
+ (n * channels + c) * pooled_height * pooled_width;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
gradient += top_diff_slice[ph * pooled_width + pw]
* (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0);
}
}
bottom_diff[index] = gradient;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n,
const int_tp num_axes,
__global const Dtype* bottom_data,
const int_tp channels,
__global const int_tp* size,
__global const int_tp* pooled_size,
__global const int_tp* kernel_size,
__global const int_tp* ext_kernel_size,
__global const int_tp* stride,
__global const int_tp* dilation,
__global const int_tp* pad,
__global Dtype* top_data,
const int use_mask,
__global int_tp* mask, __global Dtype* top_mask) {
int_tp d_idx[6];
int_tp d_start[6];
int_tp d_end[6];
int_tp d_iter[6];
int_tp i;
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
int_tp offset = 1;
int_tp num = index;
bool do_continue = false;
for (i = num_axes - 1; i >= 0; --i) {
d_idx[i] = num % pooled_size[i];
d_start[i] = d_idx[i] * stride[i] - pad[i];
d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]);
d_start[i] = max(d_start[i], (int_tp)0);
num /= pooled_size[i];
offset *= size[i];
d_iter[i] = d_start[i];
if (d_start[i] >= d_end[i]) {
top_data[index] = -FLT_MAX;
if (use_mask) {
mask[index] = -1;
} else {
top_mask[index] = -1;
}
do_continue = true;
}
}
if(do_continue) {
continue;
}
int_tp chan = num % channels;
num /= channels;
offset *= (num * channels + chan);
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
int_tp final_offset = 0;
bool incremented;
do {
final_offset = offset;
int_tp size_prod = 1;
for (i = num_axes - 1; i >= 0; --i) {
final_offset += d_iter[i] * size_prod;
size_prod *= size[i];
}
if (bottom_data[final_offset] > maxval) {
maxidx = final_offset;
maxval = bottom_data[maxidx];
}
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
if (d_iter[i] >= d_end[i] - dilation[i]) {
d_iter[i] = d_start[i];
} else {
d_iter[i] += dilation[i];
incremented = true;
break;
}
}
} while (incremented);
top_data[index] = maxval;
if (use_mask == 1) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n,
const int_tp num_axes,
__global const Dtype* top_diff,
const int use_mask,
__global const int_tp* mask,
__global const Dtype* top_mask,
const int_tp channels,
__global const int_tp* size,
__global const int_tp* pooled_size,
__global const int_tp* kernel_size,
__global const int_tp* ext_kernel_size,
__global const int_tp* stride,
__global const int_tp* dilation,
__global const int_tp* pad,
__global Dtype* bottom_diff) {
int_tp d_idx[6];
int_tp d_start[6];
int_tp d_end[6];
int_tp d_iter[6];
int_tp i;
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
// find out the local index
// find out the local offset
int_tp offset = 1;
int_tp num = index;
for (i = num_axes - 1; i >= 0; --i) {
d_idx[i] = num % size[i];
if (dilation[i] > 1) {
d_start[i] =
(d_idx[i] < ext_kernel_size[i]) ?
d_idx[i] % dilation[i] : (d_idx[i] - ext_kernel_size[i]) + 1;
d_end[i] =
(d_idx[i] >= pooled_size[i]) ?
(pooled_size[i] - 1)
- (pooled_size[i] - 1 - d_start[i]) % dilation[i] :
d_idx[i];
} else {
d_start[i] =
(d_idx[i] + pad[i] < kernel_size[i]) ?
0 : (d_idx[i] + pad[i] - kernel_size[i]) / stride[i] + 1;
d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i] + 1),
(int_tp) (pooled_size[i]));
}
num /= size[i];
offset *= pooled_size[i];
d_iter[i] = d_start[i];
if (d_start[i] > d_end[i]) {
bottom_diff[index] = 0;
return;
}
}
int_tp chan = num % channels;
num /= channels;
offset *= (num * channels + chan);
Dtype gradient = 0;
int_tp final_offset = 0;
int_tp im_offset = 0;
bool incremented;
do {
final_offset = offset;
im_offset = 0;
int_tp size_prod = 1;
int_tp pooled_size_prod = 1;
for (i = num_axes - 1; i >= 0; --i) {
final_offset += d_iter[i] * pooled_size_prod;
im_offset += d_idx[i] * size_prod;
size_prod *= size[i];
pooled_size_prod *= pooled_size[i];
}
if (use_mask) {
if (mask[final_offset] == im_offset) {
gradient += top_diff[final_offset];
}
} else {
if (top_mask[final_offset] == im_offset) {
gradient += top_diff[final_offset];
}
}
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
if (d_iter[i] > d_end[i] - dilation[i]) {
d_iter[i] = d_start[i];
} else {
d_iter[i] += dilation[i];
incremented = true;
break;
}
}
} while (incremented);
bottom_diff[index] = gradient;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
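// 2-D pooling kernels for strided-kernel (SK) layers: same as the standard pooling
// kernels, but with extended, dilated pooling windows (ext_kernel_*, dilation_*).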
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads,
__global Dtype* bottom_data,
const int_tp num,
const int_tp channels,
const int_tp height,
const int_tp width,
const int_tp pooled_height,
const int_tp pooled_width,
const int_tp kernel_h,
const int_tp kernel_w,
const int_tp ext_kernel_h,
const int_tp ext_kernel_w,
const int_tp stride_h,
const int_tp stride_w,
const int_tp dilation_h,
const int_tp dilation_w,
const int_tp pad_h,
const int_tp pad_w,
__global Dtype* top_data,
const int use_mask,
__global int_tp* mask,
__global Dtype* top_mask) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + ext_kernel_h, height);
int_tp wend = min(wstart + ext_kernel_w, width);
hstart = max(hstart, (int_tp) 0);
wstart = max(wstart, (int_tp) 0);
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
__global Dtype* bottom_data_ptr = bottom_data
+ (n * channels + c) * height * width;
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
if (bottom_data_ptr[h * width + w] > maxval) {
maxidx = h * width + w;
maxval = bottom_data_ptr[maxidx];
}
}
}
top_data[index] = maxval;
if (use_mask == 1) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(
const int_tp nthreads, __global const Dtype* top_diff, const int use_mask,
__global const int_tp* mask, __global const Dtype* top_mask,
const int_tp num, const int_tp channels, const int_tp height,
const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h,
const int_tp pad_w,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
__global const int_tp* mask_ptr = mask;
__global const Dtype* top_diff_ptr = top_diff;
// find out the local index
// find out the local offset
int_tp w = index % width;
int_tp h = (index / width) % height;
int_tp c = (index / width / height) % channels;
int_tp n = index / width / height / channels;
int_tp pooled_height_1 = pooled_height - 1;
int_tp pooled_width_1 = pooled_width - 1;
int_tp phstart =
(h < ext_kernel_h) ? h % dilation_h : (h - ext_kernel_h) + 1;
int_tp phend =
(h >= pooled_height) ?
pooled_height_1 - (pooled_height_1 - phstart) % dilation_h : h;
int_tp pwstart =
(w < ext_kernel_w) ? w % dilation_w : (w - ext_kernel_w) + 1;
int_tp pwend =
(w >= pooled_width) ?
pooled_width_1 - (pooled_width_1 - pwstart) % dilation_w : w;
Dtype gradient = 0;
int_tp offset = (n * channels + c) * pooled_height * pooled_width;
top_diff_ptr += offset;
if (use_mask == 1) {
mask_ptr += offset;
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
if (mask_ptr[ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_ptr[ph * pooled_width + pw];
}
}
}
    } else {
      // top_mask needs the same (n, c) offset that was applied to top_diff_ptr.
      for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
        for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
          if (top_mask[offset + ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_ptr[ph * pooled_width + pw];
}
}
}
}
bottom_diff[index] = gradient;
}
}
__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h,
const int_tp pad_w,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + ext_kernel_h, height + pad_h);
int_tp wend = min(wstart + ext_kernel_w, width + pad_w);
hstart = max(hstart, (int_tp)0);
wstart = max(wstart, (int_tp)0);
hend = min(hend, height);
wend = min(wend, width);
Dtype aveval = 0;
__global const Dtype* bottom_data_ptr = bottom_data;
bottom_data_ptr += (n * channels + c) * height * width;
int_tp pool_size = 0;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
aveval += bottom_data_ptr[h * width + w];
++pool_size;
}
}
top_data[index] = aveval / pool_size;
}
}
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h;
int_tp hend = min(hstart + ext_kernel_h, height);
int_tp wstart = pw * stride_w;
int_tp wend = min(wstart + ext_kernel_w, width);
Dtype cumsum = 0.;
__global const Dtype* bottom_data_ptr = bottom_data;
bottom_data_ptr += (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
cumsum += bottom_data_ptr[h * width + w];
}
}
    Dtype thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index.
cumsum = 0;
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
cumsum += bottom_data_ptr[h * width + w];
if (cumsum >= thres) {
rand_idx[index] = ((n * channels + c) * height + h) * width + w;
top_data[index] = bottom_data_ptr[h * width + w];
h = hend;
w = wend;
}
}
}
}
}
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h;
int_tp hend = min(hstart + ext_kernel_h, height);
int_tp wstart = pw * stride_w;
int_tp wend = min(wstart + ext_kernel_w, width);
    // cumsum starts at FLT_MIN (not 0) to avoid divide-by-zero problems
Dtype cumsum = FLT_MIN;
Dtype cumvalues = 0.;
__global const Dtype* bottom_data_ptr = bottom_data;
bottom_data_ptr += (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
cumsum += bottom_data_ptr[h * width + w];
cumvalues += bottom_data_ptr[h * width + w]
* bottom_data_ptr[h * width + w];
}
}
top_data[index] = cumvalues / cumsum;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
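// Slice layer: copies data between the full bottom blob and one top slice along the
// slice axis; forward == 1 copies bottom -> top, otherwise top -> bottom (backward pass).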
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads,
__global const Dtype* in_data,
const int forward, const int_tp num_slices,
const int_tp slice_size,
const int_tp bottom_slice_axis,
const int_tp top_slice_axis,
const int_tp offset_slice_axis,
__global Dtype* out_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp total_slice_size = slice_size * top_slice_axis;
const int_tp slice_num = index / total_slice_size;
const int_tp slice_index = index % total_slice_size;
const int_tp bottom_index = slice_index
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
if (forward == 1) {
out_data[index] = in_data[bottom_index];
} else {
out_data[bottom_index] = in_data[index];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
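// Softmax-with-loss kernels: forward writes -log(prob) of the labeled class per spatial
// position (zero for ignore_label), backward subtracts 1 from the labeled class of the
// gradient; counts records the non-ignored positions.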
__kernel void TEMPLATE(softmax_loss_forward,Dtype)(
int_tp n, __global const Dtype* prob_data, __global const Dtype* label,
__global Dtype* loss,
const int_tp num, const int_tp dim, const int_tp spatial_dim,
const int has_ignore_label_, const int_tp ignore_label_,
__global Dtype* counts) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp n = index / spatial_dim;
const int_tp s = index % spatial_dim;
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]);
if (has_ignore_label_ == 1 && label_value == ignore_label_) {
loss[index] = 0;
counts[index] = 0;
} else {
loss[index] = -log((Dtype)(
max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),
(Dtype) FLT_MIN)));
counts[index] = 1;
}
}
}
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top,
__global const Dtype* label,
__global Dtype* bottom_diff,
const int_tp num,
const int_tp dim,
const int_tp spatial_dim,
const int has_ignore_label_,
const int_tp ignore_label_,
__global Dtype* counts) {
const int_tp channels = dim / spatial_dim;
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
const int_tp n = index / spatial_dim;
const int_tp s = index % spatial_dim;
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]);
if (has_ignore_label_ == 1 && label_value == ignore_label_) {
for (int_tp c = 0; c < channels; ++c) {
bottom_diff[n * dim + c * spatial_dim + s] = 0;
}
counts[index] = 0;
} else {
bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
counts[index] = 1;
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
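// Solver update kernels (AdaDelta, AdaGrad, Adam, Nesterov, RMSProp, SGD): each rewrites
// the gradient g[i] in place using its history buffer(s) and the given rate/momentum
// parameters.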
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
__global Dtype* h2,
Dtype momentum,
Dtype delta,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi;
gi = gi * sqrt((h2[i] + delta) / (hi + delta));
h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi;
g[i] = local_rate * gi;
}
}
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype delta,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype hi = h[i] = h[i] + gi * gi;
g[i] = local_rate * gi / (sqrt(hi) + delta);
}
}
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* m,
__global Dtype* v,
Dtype beta1,
Dtype beta2,
Dtype eps_hat,
Dtype corrected_local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat);
}
}
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype momentum,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype hi = h[i];
Dtype hi_new = h[i] = momentum * hi + local_rate * g[i];
g[i] = (1 + momentum) * hi_new - momentum * hi;
}
}
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype rms_decay,
Dtype delta,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi;
g[i] = local_rate * g[i] / (sqrt(hi) + delta);
}
}
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype momentum,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
g[i] = h[i] = momentum * h[i] + local_rate * g[i];
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
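// Tile layer: forward replicates the bottom blob num_tiles times along the tile axis;
// backward sums the replicated gradients back into bottom_diff.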
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
const int_tp tile_size, const int_tp num_tiles,
const int_tp bottom_tile_axis,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp d = index % tile_size;
const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis;
const int_tp n = index / tile_size / num_tiles / bottom_tile_axis;
const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d;
top_data[index] = bottom_data[bottom_index];
}
}
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int_tp tile_size,
const int_tp num_tiles,
const int_tp bottom_tile_axis,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp d = index % tile_size;
const int_tp b = (index / tile_size) % bottom_tile_axis;
const int_tp n = index / tile_size / bottom_tile_axis;
bottom_diff[index] = 0;
int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d;
for (int_tp t = 0; t < num_tiles; ++t) {
bottom_diff[index] += top_diff[top_index];
top_index += bottom_tile_axis * tile_size;
}
}
}
#ifdef DOUBLE_SUPPORT_AVAILABLE
#undef Dtype
#define Dtype double
#undef TYPE
#define TYPE TYPE_DOUBLE
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out,
Dtype negative_slope) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
}
}
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff,
Dtype negative_slope) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] = in_diff[index]
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * negative_slope);
}
}
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = tanh(in[index]);
}
}
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* out_data,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype tanhx = out_data[index];
out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
}
}
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = 1.0 / (1.0 + exp(-in[index]));
}
}
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* out_data,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
const Dtype sigmoid_x = out_data[index];
out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
}
}
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] > threshold ? 1.0 : 0.0;
}
}
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n, const int_tp channels,
const int_tp dim,
__global const Dtype* in,
__global Dtype* out,
__global const Dtype* slope_data,
const int_tp div_factor) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
int_tp c = (index / dim) % channels / div_factor;
out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
}
}
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n, const int_tp channels,
const int_tp dim,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff,
__global const Dtype* slope_data,
const int_tp div_factor) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
int_tp c = (index / dim) % channels / div_factor;
out_diff[index] = in_diff[index]
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * slope_data[c]);
}
}
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n, const int_tp rows,
const int_tp rowPitch,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0?1.0:0.0);
for (int k = 1; k < rows; k++) {
out_diff[index] += in_diff[index + k * rowPitch]
* in_data[index + k * rowPitch]
* (in_data[index + k * rowPitch] <= 0?1.0:0.0);
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* y) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index] = alpha;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count, const int_tp inner_dim,
__global const Dtype* in,
__global const Dtype* permut,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / (inner_dim);
int_tp in_n = (int_tp) (permut[n]);
out[index] = in[in_n * (inner_dim) + index % (inner_dim)];
}
}
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count, const int_tp inner_dim,
__global const Dtype* in,
__global const Dtype* top_indexes,
__global const Dtype* begins,
__global const Dtype* counts,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / (inner_dim);
out[index] = 0;
int_tp lower = (int_tp) (begins[n]);
int_tp upper = lower + (int_tp) (counts[n]);
for (int_tp i = lower; i < upper; ++i) {
int_tp in_n = (int_tp) (top_indexes[i]);
out[index] += in[in_n * (inner_dim) + index % (inner_dim)];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) {
Dtype out = arg;
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const Dtype* bias,
const int_tp bias_dim,
const int_tp inner_dim,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp bias_index = (index / inner_dim) % bias_dim;
out[index] = in[index] + bias[bias_index];
}
}
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const Dtype* scale,
const int_tp scale_dim,
const int_tp inner_dim,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp scale_index = (index / inner_dim) % scale_dim;
out[index] = in[index] * scale[scale_index];
}
}
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const Dtype* scale,
__global const Dtype* bias,
const int_tp scale_dim,
const int_tp inner_dim,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp scale_index = (index / inner_dim) % scale_dim;
out[index] = in[index] * scale[scale_index] + bias[scale_index];
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
if (in[index] > 0.0f) {
out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index])));
} else {
out[index] = log((Dtype) (1.0 + exp(in[index])));
}
}
}
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n,
__global const Dtype* in_diff,
__global const Dtype* in_data,
__global Dtype* out_diff) {
Dtype kBNLL_THRESHOLD = 50.;
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD));
out_diff[index] = in_diff[index] * expval / (expval + 1.);
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
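// Softmax helper kernels: channel-wise max, subtract, exp, sum, divide and dot product
// over the channel dimension of a (num, channels, spatial_dim) blob.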
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num, const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* data,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < num * spatial_dim; index +=
get_global_size(0)) {
int_tp n = index / spatial_dim;
int_tp s = index % spatial_dim;
    Dtype maxval = -FLT_MAX;
for (int_tp c = 0; c < channels; ++c) {
maxval = max((Dtype)(data[(n * channels + c) * spatial_dim + s]), (Dtype)maxval);
}
out[index] = maxval;
}
}
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count, const int_tp num,
const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* channel_max,
__global Dtype* data) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / channels / spatial_dim;
int_tp s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s];
}
}
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count, __global const Dtype* data,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
out[index] = exp(data[index]);
}
}
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num, const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* data,
__global Dtype* channel_sum) {
for (int_tp index = get_global_id(0); index < num * spatial_dim; index +=
get_global_size(0)) {
int_tp n = index / spatial_dim;
int_tp s = index % spatial_dim;
Dtype sum = 0;
for (int_tp c = 0; c < channels; ++c) {
sum += data[(n * channels + c) * spatial_dim + s];
}
channel_sum[index] = sum;
}
}
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count, const int_tp num,
const int_tp channels, const int_tp spatial_dim,
__global const Dtype* channel_sum,
__global Dtype* data) {
for (int_tp index = get_global_id(0); index < count;
index += get_global_size(0)) {
int_tp n = index / channels / spatial_dim;
int_tp s = index % spatial_dim;
data[index] /= channel_sum[n * spatial_dim + s];
}
}
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num, const int_tp channels,
const int_tp spatial_dim,
__global const Dtype* data_1,
__global const Dtype* data_2,
__global Dtype* channel_dot) {
for (int_tp index = get_global_id(0); index < num * spatial_dim; index +=
get_global_size(0)) {
int_tp n = index / spatial_dim;
int_tp s = index % spatial_dim;
Dtype dot = 0;
for (int_tp c = 0; c < channels; ++c) {
dot += (data_1[(n * channels + c) * spatial_dim + s]
* data_2[(n * channels + c) * spatial_dim + s]);
}
channel_dot[index] = dot;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
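// Concat layer: forward == 1 scatters each bottom blob to its offset along the concat
// axis of the top blob; otherwise it gathers the corresponding top region back (backward).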
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads, __global const Dtype* in_data,
const int forward, const int_tp num_concats,
const int_tp concat_size,
const int_tp top_concat_axis,
const int_tp bottom_concat_axis,
const int_tp offset_concat_axis,
__global Dtype* out_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp total_concat_size = concat_size * bottom_concat_axis;
const int_tp concat_num = index / total_concat_size;
const int_tp concat_index = index % total_concat_size;
const int_tp top_index = concat_index
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
if (forward == 1) {
out_data[top_index] = in_data[index];
} else {
out_data[index] = in_data[top_index];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
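// Contrastive loss backward: similar pairs (y != 0) get alpha * diff; dissimilar pairs get
// a margin-based gradient that is zero once the margin is met. The legacy variant uses
// dist_sq directly instead of its square root.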
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels,
const Dtype margin, const Dtype alpha, __global const Dtype* y,
__global const Dtype* diff, __global const Dtype* dist_sq,
__global Dtype *bottom_diff) {
for (int_tp i = get_global_id(0); i < count;
i += get_global_size(0)) {
int_tp n = i / channels; // the num index, to access y and dist_sq
if (trunc(y[n]) != 0.) { // similar pairs
bottom_diff[i] = alpha * diff[i];
} else { // dissimilar pairs
Dtype mdist = 0.;
Dtype beta = 0.;
Dtype dist = sqrt(dist_sq[n]);
mdist = (margin - dist);
beta = -alpha * mdist / (dist + 1e-4) * diff[i];
if (mdist > 0.) {
bottom_diff[i] = beta;
} else {
bottom_diff[i] = 0;
}
}
}
}
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels,
const Dtype margin, const Dtype alpha, __global Dtype* y,
__global Dtype* diff, __global Dtype* dist_sq,
__global Dtype* bottom_diff) {
for (int_tp i = get_global_id(0); i < count;
i += get_global_size(0)) {
int n = i / channels; // the num index, to access y and dist_sq
if (trunc(y[n]) != 0.) { // similar pairs
bottom_diff[i] = alpha * diff[i];
} else { // dissimilar pairs
Dtype mdist = 0.;
Dtype beta = 0.;
mdist = (margin - dist_sq[n]);
beta = -alpha;
if (mdist > 0.) {
bottom_diff[i] = beta;
} else {
bottom_diff[i] = 0;
}
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) {
Dtype out = arg;
}
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#ifdef MULTI
__kernel void CFMulti(__global Dtype* image_data, int_tp image_offset,
__global Dtype* kernel_data, int_tp kernel_offset,
__global Dtype* bias,const int_tp bias_offset,
__global Dtype* convolved_image,const int_tp convolved_image_offset,
const ushort WIDTH,
const ushort HEIGHT,
const ushort OUTPUT_W,
const ushort OUTPUT_H) {
const int_tp outputX = get_global_id(0);
const int_tp outputY = get_global_id(1);
const int_tp kernelNum = get_global_id(2)*ZPAR;
if(outputX < OUTPUT_W && outputY < OUTPUT_H)
{
Dtype sum[ZPAR];
Dtype4 vectorSum[ZPAR];
for(int_tp kern =0; kern < ZPAR; kern++)
{
sum[kern] = 0.0f;
      vectorSum[kern] = (Dtype4)(0.0f, 0.0f, 0.0f, 0.0f);
}
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS;
const int_tp biasIndex=bias_offset + kernelNum;
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W;
const int_tp imageSize = WIDTH*HEIGHT;
const int_tp float4Reads = KERNEL_W / 4;
const int_tp floatReads = KERNEL_W % 4;
Dtype4 imageCache;
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset));
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
for(int_tp c = 0; c < CHANNELS; c++)
{
for(int_tp y = 0; y < KERNEL_H; y++)
{
for(int_tp x=0; x< float4Reads; x++)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[x];
for(int_tp kern =0; kern < ZPAR; kern++)
{
vectorSum[kern] += imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[x];
}
}
if(floatReads == 1)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads];
for(int_tp kern =0; kern < ZPAR; kern++)
vectorSum[kern].s0 += ( imageCache * ( (__global Dtype4*) &(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]) )[float4Reads] ).s0;
}
else if(floatReads == 2)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads];
for(int_tp kern =0; kern < ZPAR; kern++)
vectorSum[kern].s01 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s01;
}
else if(floatReads == 3)
{
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads];
for(int_tp kern =0; kern < ZPAR; kern++)
vectorSum[kern].s012 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s012;
}
image_dataPtrFloat += WIDTH;
kernel_dataPtrFloat += KERNEL_W;
}
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H;
}
for(int_tp kern =0; kern < ZPAR; kern++)
sum[kern] = vectorSum[kern].x + vectorSum[kern].y + vectorSum[kern].z + vectorSum[kern].w;
if(APPLY_BIAS == 1)
{
for(int_tp kern = 0; kern < ZPAR; kern++)
if(kernelNum+kern < OUTPUT_Z)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] =
sum[kern] + bias[biasIndex +kern];
}
else
for(int_tp kern = 0; kern < ZPAR; kern++)
if(kernelNum+kern < OUTPUT_Z)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = sum[kern];
}
}
#endif
#ifdef MULTI_11
__kernel void CFMulti_11_11_4(__global Dtype* image_data, int_tp image_offset,
__global Dtype* kernel_data, int_tp kernel_offset,
__global Dtype* bias,const int_tp bias_offset,
__global Dtype* convolved_image,const int_tp convolved_image_offset,
const ushort WIDTH,
const ushort HEIGHT,
const ushort OUTPUT_W,
const ushort OUTPUT_H) {
int_tp outputX = get_global_id(0)*XPAR;
int_tp outputY = get_global_id(1)*YPAR;
int_tp kernelNum = get_global_id(2)*ZPAR;
if(outputX < OUTPUT_W && outputY < OUTPUT_H)
{
Dtype sum[XPAR*YPAR*ZPAR];
for(int_tp kern =0; kern < XPAR*YPAR*ZPAR; kern++)
{
sum[kern] = 0.0f;
}
int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS;
int_tp biasIndex=bias_offset + kernelNum;
int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W;
int_tp imageSize = WIDTH*HEIGHT;
int_tp index;
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset));
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
Dtype16 imageCache;
Dtype8 imageCacheR;
Dtype8 kernelCache;
Dtype4 kernelCacheR;
for(int_tp c = 0; c < CHANNELS; c++)
{
for(int_tp y = 0; y < 11; y++)
{
imageCache = ((__global Dtype16*)image_dataPtrFloat)[0];
imageCacheR =((__global Dtype8*)image_dataPtrFloat)[2];
for(int_tp kern =0; kern < ZPAR; kern++)
{
kernelCache = ((__global Dtype8*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0];
kernelCacheR = ((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[2];
index = kern*XPAR;
sum[index + 0] += dot(imageCache.S0123,kernelCache.S0123);
sum[index + 1] += dot(imageCache.S4567,kernelCache.S0123);
sum[index + 2] += dot(imageCache.S89AB,kernelCache.S0123);
sum[index + 3] += dot(imageCache.SCDEF,kernelCache.S0123);
sum[index + 0] += dot(imageCache.S4567,kernelCache.S4567);
sum[index + 1] += dot(imageCache.S89AB,kernelCache.S4567);
sum[index + 2] += dot(imageCache.SCDEF,kernelCache.S4567);
sum[index + 3] += dot(imageCacheR.S0123,kernelCache.S4567);
sum[index + 0] += dot(imageCache.S89A,kernelCacheR.S012);
sum[index + 1] += dot(imageCache.SCDE,kernelCacheR.S012);
sum[index + 2] += dot(imageCacheR.S012,kernelCacheR.S012);
sum[index + 3] += dot(imageCacheR.S456,kernelCacheR.S012);
}
image_dataPtrFloat += WIDTH;
kernel_dataPtrFloat += KERNEL_W;
}
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H;
}
if(APPLY_BIAS == 1)
{
for(int_tp kern = 0; kern < ZPAR; kern++)
{
for(int_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] =
sum[kern*XPAR + wi] + bias[biasIndex +kern];
}
}
else
for(int_tp kern = 0; kern < ZPAR; kern++)
for(int_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = sum[kern*XPAR + wi];
}
}
#endif
#ifdef MULTI_GEN
__kernel void CFMulti_6(__global const Dtype* restrict image_data, const int_tp image_offset,
__global const Dtype* restrict kernel_data, const int_tp kernel_offset,
__global const Dtype* restrict bias,const int_tp bias_offset,
__global Dtype* restrict convolved_image,const int_tp convolved_image_offset,
const ushort WIDTH,
const ushort HEIGHT,
const ushort OUTPUT_W,
const ushort OUTPUT_H) {
const int_tp outputX = get_global_id(0)*XPAR;
const int_tp outputY = get_global_id(1)*YPAR;
const int_tp kernelNum = get_global_id(2)*ZPAR;
if(outputX < OUTPUT_W && outputY < OUTPUT_H)
{
Dtype sum[XPAR*YPAR*ZPAR];
for(uint_tp kern = 0; kern < XPAR*YPAR*ZPAR; kern++)
sum[kern] = 0.0f;
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS;
const int_tp biasIndex=bias_offset + kernelNum;
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W;
const int_tp imageSize = WIDTH*HEIGHT;
int_tp index;
__global const Dtype* image_dataPtrFloat[2];
image_dataPtrFloat[0] = (image_data + (image_offset + local_image_offset));
image_dataPtrFloat[1] = image_dataPtrFloat[0];
__global const Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
DTImage imageCache[YPAR];
DTKernel kernelCache;
Dtype4 temp;
for(uint_tp c = 0; c < CHANNELS; c++)
{
imageCache[0] = ((__global DTImage*)image_dataPtrFloat[1])[0];
for(uint_tp preload = 1; preload < YPAR; preload++)
{
image_dataPtrFloat[1] += WIDTH;
imageCache[preload] = ((__global DTImage*)image_dataPtrFloat[1])[0];
}
int_tp y =0;
LOOP(KERNEL_H, y,
{
int_tp kern=0;
LOOP(ZPAR, kern,
{
kernelCache = ((__global DTKernel*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0];
index = kern*XPAR*YPAR;
for(uint_tp y_par = 0; y_par < YPAR; y_par++)
{
temp = floatDotV4(imageCache[y_par],kernelCache);
sum[index + y_par*XPAR + 0] += temp.s0;
sum[index + y_par*XPAR + 1] += temp.s1;
sum[index + y_par*XPAR + 2] += temp.s2;
sum[index + y_par*XPAR + 3] += temp.s3;
}
});
kernel_dataPtrFloat += KERNEL_W;
for(uint_tp rotateData = 0; rotateData < YPAR - 1; rotateData++)
imageCache[rotateData] = imageCache[rotateData + 1];
image_dataPtrFloat[1] += WIDTH;
imageCache[YPAR - 1] = ((__global DTImage*)image_dataPtrFloat[1])[0];
});
image_dataPtrFloat[0] += imageSize;
image_dataPtrFloat[1] = image_dataPtrFloat[0];
}
if(APPLY_BIAS == 1)
{
for(uint_tp kern = 0; kern < ZPAR; kern++)
{
for(uint_tp hi =0; hi < YPAR; hi++)
for(uint_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY +hi)*OUTPUT_W + outputX + wi] =
sum[kern*XPAR*YPAR + XPAR*hi + wi] + bias[biasIndex +kern];
}
}
else
for(uint_tp kern = 0; kern < ZPAR; kern++)
for(uint_tp hi =0; hi < YPAR; hi++)
for(uint_tp wi =0; wi < XPAR; wi++)
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H)
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY + hi)*OUTPUT_W + outputX + wi] = sum[kern*XPAR*YPAR +XPAR*hi +wi];
}
}
#endif
//Begin IDLF kernels below here
#ifdef IDLF
#define activation_function(x) (x)
#if 0
#define _IW INPUT_WIDTH
#define _IH INPUT_HEIGHT
#define _OW OUTPUT_WIDTH
#define _OH OUTPUT_HEIGHT
#endif
#define _ID INPUT_DEPTH
#define _OD NUM_FILTERS
#define FILTER_DEPTH INPUT_DEPTH
#define NUM_INPUT INPUT_DEPTH
#define NUM_OUTPUT NUM_FILTERS
#define KERNEL FILTER_WIDTH
// convolution stride, same for x and y
#define K_STRIDE STRIDEX
#ifndef IWPAD
#define IWPAD 0
#endif
#ifndef IHPAD
#define IHPAD 0
#endif
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT)
#ifndef MASTER_OUT_BLOCK_WIDTH
#define MASTER_OUT_BLOCK_WIDTH OUT_BLOCK_WIDTH
#endif
#ifndef MASTER_OUT_BLOCK_HEIGHT
#define MASTER_OUT_BLOCK_HEIGHT OUT_BLOCK_HEIGHT
#endif
// Each work-item computes a 4x6 region of one output map.
// Each work-group (which will be mapped to 1 SIMD16 EU thread) will compute 16 different feature maps, but each feature map is for the same 4x6 region of the input image.
// NDRange: (_OW+pad)/ OUT_BLOCK_WIDTH, (_OH+pad)/OUT_BLOCK_HEIGHT, _OD/OUT_BLOCK_DEPTH
//#define SIMD_SIZE 16
// NOTE: this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
#ifdef SIMD16
#define TILE_X ((OUT_BLOCK_WIDTH - 1) * STRIDEX + KERNEL)
#define TILE_Y ((OUT_BLOCK_HEIGHT - 1) * STRIDEY + KERNEL)
#if (TILE_X % 4) != 0
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
kernel void
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs
__global float* inputs_base,
filter_qualifier float* weights_base,
__global float* biases_base,
__global float* outputs_base,
const ushort _IW,
const ushort _IH,
const ushort _OW,
const ushort _OH)
{
__global float* outputs = outputs_base;
__global float* inputs = inputs_base;
filter_qualifier float* weights = weights_base;
__global float* biases = biases_base;
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth
uint_tp fmg = get_group_id(2);
uint_tp lid = get_local_id(2);
float in[IN_BUFFER_SIZE];// load 11x16 block of input data, really only need 11x15 for 4x6 outputs, but keep it simple.
//float out[24]; // 4x6 block of outputs that is SIMD_SIZE deep (along the Feature Map dimension).
float out[OUT_BLOCK_SIZE];
uint_tp in_addr;
  // find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid;
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) {
out[i]=0.0f;
}
uint_tp num_in_batch = fm / _OD;
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE;
for(int_tp kd = 0; kd < _ID; kd++)
{
in_addr = input_batch_offset + (kd + INPUT_START_Z) * (_IH + IHPAD) * (_IW + IWPAD) + (or*K_STRIDE + INPUT_START_Y) * (_IW + IWPAD) + (oc*K_STRIDE + INPUT_START_X) + lid;
// read 11x16 input block into registers.
for(uint_tp reg = 0; reg < IN_BUFFER_SIZE; reg++) {
in[reg] = inputs[in_addr]; // read 16 elements
in_addr += (_IW + IWPAD);// move to next row down
}
    // WEIGHT_PREF can only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8
union {
float w[WEIGHT_PREF];
uint8 ui8;
} weight_buf;
int_tp w_idx=0;
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
uint_tp orig_weight_addr = weight_addr;
weight_addr += SIMD_SIZE * WEIGHT_PREF;
int_tp kr = 0; // kr = Kernel Row
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop.
{
int_tp kc = 0; // kc = Kernel Column
LOOP(KERNEL, kc,
{
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) {
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
float input = intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
}
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL*KERNEL % 8 != 0
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
}
#if KERNEL*KERNEL % 8 == 0
// need to do nothing
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF)))
#if KERNEL*KERNEL % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL*KERNEL % 4 == 0
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]);
#else
      // should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size.
#endif
#endif
++w_idx;
});
});
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE;
}
#ifdef IMAGE_AS_OUTPUT
// TODO: no ULT for that one yet!
uint_tp out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD)) * (_OW + OWPAD) * (_OH + OHPAD);// out_addr indexes into start of 16 feature maps.
#else
// we need this address calculation for outputs because we support views and batching
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD);
#endif
  out_addr += or * (_OW + OWPAD) + oc; // offset for the output block this work-item is working on
// we need this address calculation for biases because we support views and batching
float bias = biases[(fm) % _OD ];
#ifndef WRITE_PADDED_VALUES
if(get_global_id(0) != (get_global_size(0)-1) &&
get_global_id(1) != (get_global_size(1)-1) )
{
#endif
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
#ifndef WRITE_PADDED_VALUES
} else if ( get_global_id(1) != (get_global_size(1)-1) )
{
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
}
else if ( get_global_id(0) != (get_global_size(0)-1) )
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
}
else
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
#ifdef IMAGE_AS_OUTPUT
write_imagef(outputs,(int2)(c,r*(_OW + OWPAD)),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
#endif
}
}
}
#endif //#ifndef WRITE_PADDED_VALUES
}
#endif
#if TILE_X % 4 == 0
#define TILE_Y_STRIDE (64 / TILE_X)
#define INVEC_NUM ((TILE_Y + TILE_Y_STRIDE - 1) / TILE_Y_STRIDE)
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
kernel void
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs
__global float* inputs_base,
filter_qualifier float* weights_base,
__global float* biases_base,
__global float* outputs_base,
const ushort _IW,
const ushort _IH,
const ushort _OW,
const ushort _OH)
{
__global float* outputs = outputs_base;
__global float* inputs = inputs_base;
filter_qualifier float* weights = weights_base;
__global float* biases = biases_base;
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth
uint_tp fmg = get_group_id(2);
uint_tp lid = get_local_id(2);
float out[OUT_BLOCK_SIZE];
uint_tp in_addr;
  // find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid;
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) {
out[i]=0.0f;
}
uint_tp num_in_batch = ( fm ) / _OD;
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE;
in_addr = input_batch_offset + INPUT_START_Z * (_IH + IHPAD) * (_IW + IWPAD) + (or*STRIDEY + INPUT_START_Y) * (_IW + IWPAD) + (oc*STRIDEX + INPUT_START_X)
+ ( lid / ( TILE_X / 4 ) ) * (_IW + IWPAD) * STRIDEY // y tile offset
+ ( lid % ( TILE_X / 4 ) ) * 4 * STRIDEX; // x tile offset
for(int_tp kd = 0; kd < _ID; kd++)
{
union {
float4 in_vec[INVEC_NUM];
float in_array[INVEC_NUM * 4];
} in_buf;
uint_tp in_offset = in_addr;
int_tp reg = 0;
#if INVEC_NUM == 1
LOOP(1, reg,
#elif INVEC_NUM == 2
LOOP(2, reg,
#elif INVEC_NUM == 3
LOOP(3, reg,
#elif INVEC_NUM == 4
LOOP(4, reg,
#else
#error too large invec_num.
#endif
{
in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // read 16 elements
in_offset += (_IW + IWPAD) * TILE_Y_STRIDE;
});
in_addr += (_IH + IHPAD) * (_IW + IWPAD);
    // WEIGHT_PREF can only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8
union {
float w[WEIGHT_PREF];
uint8 ui8;
} weight_buf;
int_tp w_idx=0;
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
uint_tp orig_weight_addr = weight_addr;
weight_addr += SIMD_SIZE * WEIGHT_PREF;
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
int_tp kr = 0; // kr = Kernel Row
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop.
{
int_tp kc = 0; // kc = Kernel Column
LOOP(KERNEL, kc,
{
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) {
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
float input = BLOCK_IN((br * STRIDEY + kr) * TILE_X + bc * STRIDEX + kc);//intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
}
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL*KERNEL % 8 != 0
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
}
#if KERNEL*KERNEL % 8 == 0
// need to do nothing
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF)))
#if KERNEL*KERNEL % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL*KERNEL % 4 == 0
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]);
#else
      // should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size.
#endif
#endif
++w_idx;
});
});
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE;
}
// we need this address calculation for outputs because we support views and batching
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD);
  out_addr += or * (_OW + OWPAD) + oc; // offset for the output block this work-item is working on
// we need this address calculation for biases because we support views and batching
float bias = biases[(fm) % _OD ];
#ifndef WRITE_PADDED_VALUES
if(get_global_id(0) != (get_global_size(0)-1) &&
get_global_id(1) != (get_global_size(1)-1) )
{
#endif
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
#ifndef WRITE_PADDED_VALUES
} else if ( get_global_id(1) != (get_global_size(1)-1) )
{
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
}
else if ( get_global_id(0) != (get_global_size(0)-1) )
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
}
else
{
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) {
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) {
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
}
}
}
#endif //#ifndef WRITE_PADDED_VALUES
}
#endif // TILE_X % 4 == 0
#endif
#endif
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
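// Helper kernels for the spatial convolution path: copyImage writes each image into a
// zero-padded buffer, and copyWeightsSwizzled reorders filters into the swizzled layout
// those convolution kernels expect.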
__kernel void TEMPLATE(copyImage, Dtype)
(__global Dtype* image_data,
int_tp image_offset,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp adjustedHeight, const int_tp adjustedWidth,
const int_tp pad_h, const int_tp pad_w,
__global Dtype* output_image,
const int_tp output_offset,
const int_tp batch_size) {
uint_tp sX = get_global_id(0);
uint_tp sY = get_global_id(1);
uint_tp sZ = get_global_id(2);
int_tp in_y = sY - pad_h;
int_tp in_x = sX - pad_w;
int_tp batch_offset = 0;
int_tp adjusted_batch_offset = 0;
for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) {
int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX;
int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x;
if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width))
output_image[dst_offset] = image_data[src_offset];
else
output_image[dst_offset] = 0;
batch_offset += height * width * channels;
adjusted_batch_offset += adjustedHeight * adjustedWidth * channels;
}
}
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
(__global Dtype* weightIn,
__global Dtype* weightOut,
const int_tp kernel_w,
const int_tp kernel_h,
const int_tp channels,
const int_tp outputs,
const int_tp swizzleFactor) {
uint_tp sX = get_global_id(0);
//Original location
//Output location
int_tp outputSublayer = channels / swizzleFactor;
int_tp outputSublayerIndex = channels % swizzleFactor;
int_tp filter = sX / (kernel_w*kernel_h*channels);
int_tp kernel_X = sX % kernel_w;
int_tp kernel_Y = (sX / kernel_w) % kernel_h;
int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels;
int_tp FP = filter / swizzleFactor;
int_tp F1 = filter % swizzleFactor;
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(crop_copy, Dtype)(const int_tp n, const int_tp height,
const int_tp width,
const int_tp src_outer_stride,
const int_tp src_inner_stride,
const int_tp dest_outer_stride,
const int_tp dest_inner_stride,
__global const Dtype* src,
const int_tp src_off,
__global Dtype* dest,
const int_tp dest_off) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
int_tp src_start = index / height * src_outer_stride
+ index % height * src_inner_stride;
int_tp dest_start = index / height * dest_outer_stride
+ index % height * dest_inner_stride;
for (int_tp i = 0; i < width; ++i) {
dest[dest_off + dest_start + i] = src[src_off + src_start + i];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
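// Dropout: keeps (and rescales by scale) the elements whose random mask value exceeds
// threshold, zeroing the rest; the same rule is applied to the diff in the backward pass.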
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n,
__global const Dtype* in,
__global const uint_tp* mask,
const uint_tp threshold,
const Dtype scale,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale;
}
}
__kernel void TEMPLATE(dropout_backward,Dtype)(
const int_tp n, __global const Dtype* in_diff,
__global const uint_tp* mask, const uint_tp threshold,
const Dtype scale,
__global Dtype* out_diff) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(eltwise_max_forward,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data_a,
__global const Dtype* bottom_data_b, const int_tp blob_idx,
__global Dtype* top_data,
__global int_tp* mask) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
if (bottom_data_a[index] > bottom_data_b[index]) {
// only update for very first bottom_data blob (blob_idx == 0)
if (blob_idx == 0) {
maxval = bottom_data_a[index];
top_data[index] = maxval;
maxidx = blob_idx;
mask[index] = maxidx;
}
} else {
maxval = bottom_data_b[index];
top_data[index] = maxval;
maxidx = blob_idx + 1;
mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int_tp blob_idx,
__global const int_tp* mask,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
Dtype gradient = 0;
if (mask[index] == blob_idx) {
gradient += top_diff[index];
}
bottom_diff[index] = gradient;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(elu_forward,Dtype)(const int n, __global const Dtype* in,
__global Dtype* out,
Dtype alpha) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0);
}
}
__kernel void TEMPLATE(elu_backward,Dtype)(const int n, __global const Dtype* in_diff,
__global const Dtype* out_data,
__global const Dtype* in_data,
__global Dtype* out_diff,
Dtype alpha) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
out_diff[index] =
in_data[index] > 0 ?
in_diff[index] : in_diff[index] * (out_data[index] + alpha);
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
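// Embedding layer: forward gathers the weight row selected by each (integer-valued)
// bottom_data entry; backward scatter-adds top_diff into weight_diff using the
// atomic_add helper defined below.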
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads,
__global const Dtype* bottom_data,
__global const Dtype* weight,
const int_tp M, const int_tp N,
const int_tp K,
__global Dtype* top_data) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
top_data[top_index] = weight[weight_index];
}
}
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
#if (TYPE == TYPE_FLOAT)
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) {
union {
uint_tp intVal;
Dtype floatVal;
} newVal;
union {
uint_tp intVal;
Dtype floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
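// Note on the helper above: it emulates a floating-point atomic add with a
// compare-and-swap retry loop. The union reinterprets the 32-bit float as an
// unsigned int, and atomic_cmpxchg only commits the new bits if *source has
// not been modified by another work-item since it was read; otherwise the
// loop retries. atomic_cmpxchg on 32-bit global integers is core in
// OpenCL 1.1+ (cl_khr_global_int32_base_atomics in OpenCL 1.0), and it is the
// call the build log reports as unsupported on this device.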
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K,
__global Dtype* weight_diff) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index));
}
}
#endif
#if (TYPE == TYPE_DOUBLE)
#ifdef ATOMICS_64_AVAILABLE
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) {
union {
unsigned long intVal;
Dtype floatVal;
} newVal;
union {
unsigned long intVal;
Dtype floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K,
__global Dtype* weight_diff) {
for (int_tp top_index = get_global_id(0); top_index < nthreads;
top_index += get_global_size(0)) {
const int_tp n = top_index / N;
const int_tp d = top_index % N;
const int_tp index = (int_tp)(bottom_data[n]);
const int_tp weight_index = index * N + d;
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index));
}
}
#endif
#endif
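// The double-precision variant above is compiled only when
// ATOMICS_64_AVAILABLE is defined, because atom_cmpxchg on 64-bit integers
// requires the cl_khr_int64_base_atomics extension.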
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x,
const int_tp offx) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
x[index + offx] = alpha;
}
}
__kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x,
const int_tp offx) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
x[index + offx] = alpha;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(im2col,Dtype)(const int_tp n,
__global const Dtype* data_im,
const int_tp data_im_off,
const int_tp height, const int_tp width,
const int_tp kernel_h,
const int_tp kernel_w, const int_tp pad_h,
const int_tp pad_w, const int_tp stride_h,
const int_tp stride_w,
const int_tp dilation_h,
const int_tp dilation_w,
const int_tp height_col,
const int_tp width_col,
__global Dtype* data_col,
const int_tp data_col_off) {
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
const int_tp h_index = index / width_col;
const int_tp h_col = h_index % height_col;
const int_tp w_col = index % width_col;
const int_tp c_im = h_index / height_col;
const int_tp c_col = c_im * kernel_h * kernel_w;
const int_tp h_offset = h_col * stride_h - pad_h;
const int_tp w_offset = w_col * stride_w - pad_w;
__global Dtype* data_col_ptr = data_col + data_col_off;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
__global const Dtype* data_im_ptr = data_im + data_im_off;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int_tp i = 0; i < kernel_h; ++i) {
for (int_tp j = 0; j < kernel_w; ++j) {
int_tp h_im = h_offset + i * dilation_h;
int_tp w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
data_col_ptr += height_col * width_col;
}
}
}
}
__kernel void TEMPLATE(col2im,Dtype)(const int_tp n,
__global const Dtype* data_col,
const int_tp data_col_off,
const int_tp height, const int_tp width,
const int_tp channels,
const int_tp kernel_h,
const int_tp kernel_w, const int_tp pad_h,
const int_tp pad_w, const int_tp stride_h,
const int_tp stride_w,
const int_tp dilation_h,
const int_tp dilation_w,
const int_tp height_col,
const int_tp width_col,
__global Dtype* data_im,
const int_tp data_im_off) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype val = 0;
const int_tp w_im = index % width + pad_w;
const int_tp h_im = (index / width) % height + pad_h;
const int_tp c_im = index / (width * height);
int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
const int_tp w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int_tp w_col_end = min(w_im / stride_w + 1, width_col);
const int_tp h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int_tp h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int_tp h_k = (h_im - h_col * stride_h);
int_tp w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_off + data_col_index];
}
}
}
data_im[data_im_off + index] = val;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes,
const int_tp channel_axis,
__global const Dtype* data_im,
const int_tp data_im_off,
__global const int_tp* im_shape,
__global const int_tp* col_shape,
__global const int_tp* kernel_shape,
__global const int_tp* pad,
__global const int_tp* stride,
__global const int_tp* dilation,
__global Dtype* data_col,
const int_tp data_col_off) {
int_tp d_temp[6];
int_tp d_iter[6];
int_tp i;
__global const int_tp* im_shape_ptr = im_shape + channel_axis;
__global const int_tp* col_shape_ptr = col_shape + channel_axis;
__local int_tp shared_dilation[6];
__local int_tp shared_kernel_shape[6];
__local int_tp shared_pad[6];
__local int_tp shared_stride[6];
__local int_tp shared_col_shape[6 + 1];
__local int_tp shared_im_shape[6 + 1];
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) {
shared_dilation[li] = dilation[li];
shared_kernel_shape[li] = kernel_shape[li];
shared_pad[li] = pad[li];
shared_stride[li] = stride[li];
}
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) {
shared_col_shape[li] = col_shape_ptr[li];
shared_im_shape[li] = im_shape_ptr[li];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int_tp index = get_global_id(0); index < n;
index += get_global_size(0)) {
// Initialize channel_in, computed in the loop below, with intermediate
// computations used to compute the spatial indices.
int_tp channel_in = index;
int_tp channel_out = 1;
for (i = num_axes - 1; i >= 0; --i) {
d_temp[i] = channel_in % shared_col_shape[i + 1];
channel_in /= shared_col_shape[i + 1];
channel_out *= shared_kernel_shape[i];
}
channel_out *= channel_in;
int_tp data_col_inc = 1;
for (i = 0; i < num_axes; ++i) {
channel_out *= shared_col_shape[i + 1];
channel_out += d_temp[i];
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i];
channel_in *= shared_im_shape[i + 1];
channel_in += d_temp[i];
data_col_inc *= shared_col_shape[i + 1];
d_iter[i] = 0;
}
__global Dtype* data_col_ptr = data_col + data_col_off + channel_out;
__global const Dtype* data_im_ptr = data_im + data_im_off + channel_in;
bool incremented;
do {
bool in_range = true;
for (i = 0; i < num_axes; ++i) {
const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i];
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1];
if (!in_range) {
break;
}
}
if (in_range) {
int_tp data_im_offset = d_iter[0] * shared_dilation[0];
for (i = 1; i < num_axes; ++i) {
data_im_offset *= shared_im_shape[i + 1];
data_im_offset += d_iter[i] * shared_dilation[i];
}
*data_col_ptr = data_im_ptr[data_im_offset];
} else {
*data_col_ptr = 0;
}
data_col_ptr += data_col_inc;
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
const int_tp d_max = shared_kernel_shape[i];
if (d_iter[i] == d_max - 1) {
d_iter[i] = 0;
} else { // d_iter[i] < d_max - 1
++d_iter[i];
incremented = true;
break;
}
} // for (int_tp i = num_axes - 1; i >= 0; --i)
} while (incremented); // do
}
}
__kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes,
const int_tp channel_axis,
__global const Dtype* data_col,
const int_tp data_col_off,
__global const int_tp* im_shape,
__global const int_tp* col_shape,
__global const int_tp* kernel_shape,
__global const int_tp* pad,
__global const int_tp* stride,
__global const int_tp* dilation,
__global Dtype* data_im,
const int_tp data_im_off) {
int_tp d_im[6];
int_tp d_col_iter[6];
int_tp d_col_start[6];
int_tp d_col_end[6];
__global const int_tp* im_shape_ptr = im_shape + channel_axis;
__global const int_tp* col_shape_ptr = col_shape + channel_axis;
__local int_tp shared_dilation[6];
__local int_tp shared_kernel_shape[6];
__local int_tp shared_pad[6];
__local int_tp shared_stride[6];
__local int_tp shared_col_shape[6 + 1];
__local int_tp shared_im_shape[6 + 1];
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) {
shared_dilation[li] = dilation[li];
shared_kernel_shape[li] = kernel_shape[li];
shared_pad[li] = pad[li];
shared_stride[li] = stride[li];
}
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) {
shared_col_shape[li] = col_shape_ptr[li];
shared_im_shape[li] = im_shape_ptr[li];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    // Decompose the flattened index: the loop below peels off the spatial
    // dimensions into d_im, leaving the channel index in c_im.
int_tp c_im = index;
// Calculate d_im (image dimensions).
for (int_tp i = num_axes - 1; i >= 0; --i) {
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i];
c_im /= shared_im_shape[i + 1];
}
// Calculate col start/end indices.
bool done = false;
for (int_tp i = 0; i < num_axes; ++i) {
const int_tp kernel_extent = shared_dilation[i]
* (shared_kernel_shape[i] - 1) + 1;
d_col_start[i] = d_col_iter[i] =
(d_im[i] < kernel_extent) ?
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1;
d_col_end[i] = min(d_im[i] / shared_stride[i] + 1,
shared_col_shape[i + 1]);
if (d_col_start[i] >= d_col_end[i]) {
// Skip computation if the dimension is 0 at any spatial axis --
// final val will be 0.
data_im[index] = 0;
done = true;
break; // for (int_tp i = 0; i < num_axes; ++i)
}
}
if (!done) {
// Loop over the col to compute the output val.
Dtype val = 0;
bool incremented = true;
bool skip = false;
do {
// Compute the final offset.
int_tp final_offset = 0;
int_tp kernel_shape_prod = 1;
int_tp kernel_index;
for (int_tp i = num_axes - 1; i >= 0; --i) {
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i];
if (kernel_index % shared_dilation[i]) {
skip = true;
break;
} else {
kernel_index /= shared_dilation[i];
final_offset += kernel_index * kernel_shape_prod;
kernel_shape_prod *= shared_kernel_shape[i];
}
}
if (!skip) {
final_offset += kernel_shape_prod * c_im;
for (int_tp i = 0; i < num_axes; ++i) {
final_offset *= shared_col_shape[i + 1];
final_offset += d_col_iter[i];
}
val += data_col[data_col_off + final_offset];
}
skip = false;
incremented = false;
for (int_tp i = num_axes - 1; i >= 0; --i) {
const int_tp d_max = d_col_end[i];
if (d_col_iter[i] == d_max - 1) {
d_col_iter[i] = d_col_start[i];
} else { // d_col_iter[i] < d_max - 1
++d_col_iter[i];
incremented = true;
break; // for (int_tp i = num_axes - 1; i >= 0; --i)
}
} // for (int_tp i = num_axes - 1; i >= 0; --i)
} while (incremented);
data_im[data_im_off + index] = val;
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads,
__global const Dtype* in,
__global const Dtype* scale,
const Dtype negative_beta,
__global Dtype* out) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
out[index] = in[index] * pow(scale[index], negative_beta);
}
}
__kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in,
const int_tp num, const int_tp channels,
const int_tp height, const int_tp width, const int_tp size,
const Dtype alpha_over_size, const Dtype k,
__global Dtype* const scale) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp n = index / width / height;
const int_tp offset = (n * channels * height + h) * width + w;
const int_tp step = height * width;
__global const Dtype* in_off = in + offset;
__global Dtype* scale_off = scale + offset;
int_tp head = 0;
const int_tp pre_pad = (size - 1) / 2;
const int_tp post_pad = size - pre_pad - 1;
Dtype accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
accum_scale += in_off[head * step] * in_off[head * step];
++head;
}
// both add and subtract
while (head < channels) {
accum_scale += in_off[head * step] * in_off[head * step];
if (head - size >= 0) {
accum_scale -= in_off[(head - size) * step]
* in_off[(head - size) * step];
}
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
// subtract only
while (head < channels + post_pad) {
if (head - size >= 0) {
accum_scale -= in_off[(head - size) * step]
* in_off[(head - size) * step];
}
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
++head;
}
}
}
__kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads,
__global const Dtype* bottom_data,
__global const Dtype* top_data,
__global const Dtype* scale,
__global const Dtype* top_diff, const int_tp num,
const int_tp channels, const int_tp height,
const int_tp width, const int_tp size,
const Dtype negative_beta,
const Dtype cache_ratio,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp n = index / width / height;
const int_tp offset = (n * channels * height + h) * width + w;
const int_tp step = height * width;
__global const Dtype* bottom_off = bottom_data + offset;
__global const Dtype* top_off = top_data + offset;
__global const Dtype* scale_off = scale + offset;
__global const Dtype* top_diff_off = top_diff + offset;
__global Dtype* bottom_diff_off = bottom_diff + offset;
int_tp head = 0;
const int_tp pre_pad = size - (size + 1) / 2;
const int_tp post_pad = size - pre_pad - 1;
Dtype accum_ratio = 0;
// accumulate values
while (head < post_pad && head < channels) {
accum_ratio += top_diff_off[head * step] * top_off[head * step]
/ scale_off[head * step];
++head;
}
// both add and subtract
while (head < channels) {
accum_ratio += top_diff_off[head * step] * top_off[head * step]
/ scale_off[head * step];
if (head - size >= 0) {
accum_ratio -= top_diff_off[(head - size) * step]
* top_off[(head - size) * step] / scale_off[(head - size) * step];
}
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad)
* step] * pow(scale_off[(head - post_pad) * step], negative_beta)
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
++head;
}
// subtract only
while (head < channels + post_pad) {
if (head - size >= 0) {
accum_ratio -= top_diff_off[(head - size) * step]
* top_off[(head - size) * step] / scale_off[(head - size) * step];
}
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad)
* step] * pow(scale_off[(head - post_pad) * step], negative_beta)
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
++head;
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) {
return (Dtype)1 / ((Dtype)1 + exp(-x));
}
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) {
return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1;
}
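// lstm_tanh relies on the identity tanh(x) = 2 * sigmoid(2x) - 1, so both gate
// activations reuse the same exp()-based sigmoid helper.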
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* X, __global Dtype* X_acts) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp x_dim = 4 * dim;
const int_tp d = index % x_dim;
if (d < 3 * dim) {
X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]);
} else {
X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]);
}
}
}
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont,
__global Dtype* C, __global Dtype* H) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp n = index / dim;
const int_tp d = index % dim;
__global const Dtype* X_offset = X + 4 * dim * n;
const Dtype i = X_offset[d];
const Dtype f = X_offset[1 * dim + d];
const Dtype o = X_offset[2 * dim + d];
const Dtype g = X_offset[3 * dim + d];
const Dtype c_prev = C_prev[index];
const Dtype c = cont[n] * f * c_prev + i * g;
C[index] = c;
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c);
H[index] = o * tanh_c;
}
}
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H,
__global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff,
__global Dtype* C_prev_diff, __global Dtype* X_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp n = index / dim;
const int_tp d = index % dim;
__global const Dtype* X_offset = X + 4 * dim * n;
const Dtype i = X_offset[d];
const Dtype f = X_offset[1 * dim + d];
const Dtype o = X_offset[2 * dim + d];
const Dtype g = X_offset[3 * dim + d];
const Dtype c_prev = C_prev[index];
const Dtype c = C[index];
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c);
__global Dtype* c_prev_diff = C_prev_diff + index;
__global Dtype* X_diff_offset = X_diff + 4 * dim * n;
__global Dtype* i_diff = X_diff_offset + d;
__global Dtype* f_diff = X_diff_offset + 1 * dim + d;
__global Dtype* o_diff = X_diff_offset + 2 * dim + d;
__global Dtype* g_diff = X_diff_offset + 3 * dim + d;
const Dtype c_term_diff =
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
const Dtype cont_n = cont[n];
*c_prev_diff = cont_n * c_term_diff * f;
*i_diff = c_term_diff * g;
*f_diff = cont_n * c_term_diff * c_prev;
*o_diff = H_diff[index] * tanh_c;
*g_diff = c_term_diff * i;
}
}
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim,
__global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp x_dim = 4 * dim;
const int_tp d = index % x_dim;
const Dtype X_act = X_acts[index];
if (d < 3 * dim) {
X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act);
} else {
X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act);
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa,
__global Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = a[index + offa] * b[index + offb];
}
}
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa,
__global Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = a[index + offa] / b[index + offb];
}
}
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha,
__global Dtype* Y,
const int_tp offY) {
for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) {
Y[offY + index] += alpha;
}
}
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global const Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = a[offa + index] + b[offb + index];
}
}
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global const Dtype* b,
const int_tp offb, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = a[offa + index] - b[offb + index];
}
}
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = fabs((Dtype)(a[offa + index]));
}
}
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = exp(a[offa + index]);
}
}
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[offy + index] = log((Dtype)(a[offa + index]));
}
}
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a,
const int_tp offa, Dtype alpha,
__global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    if (alpha == 2.0) {
y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha);
} else {
y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha);
}
}
}
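// In the alpha == 2.0 branch above, the base is passed through fabs() first.
// The result is unchanged (x^2 == |x|^2); presumably this works around pow()
// misbehaving for negative bases on some OpenCL implementations.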
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x,
const int_tp offx, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = (0.0 < x[index + offx])
- (x[index + offx] < 0.0);
}
}
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x,
const int_tp offx, __global Dtype* y,
const int_tp offy) {
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
y[index + offy] = signbit(x[index + offx]);
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads,
const int_tp dims,
__global const Dtype* bottom_a,
const int_tp forward_a,
__global const Dtype* bottom_b,
const int_tp forward_b,
__global Dtype* top,
const int_tp num,
const int_tp channels_a,
const int_tp channels_b,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
int_tp batch_id = index / ((channels_a + channels_b) * size_a);
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a)
/ (channels_a * size_a)) % 2;
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
if (bottom_id == 0) {
int_tp channel_id = (index / size_a) % channels_a;
int_tp aidx = batch_id * channels_a + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
top[index] = (forward_a == 1) ? bottom_a[aidx] : 0;
} else {
int_tp channel_id = (index / size_a) % channels_b;
int_tp bidx = (batch_id * channels_b + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
top[index] = (forward_b == 1) ? bottom_b[bidx] : 0;
}
}
}
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads,
const int_tp dims,
__global Dtype* bottom_a,
const int_tp backward_a,
__global Dtype* bottom_b,
const int_tp backward_b,
__global const Dtype* top,
const int_tp num,
const int_tp channels_a,
const int_tp channels_b,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp batch_id = index / ((channels_a + channels_b) * size_a);
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a)
/ (channels_a * size_a)) % 2;
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
if (bottom_id == 0) {
int_tp channel_id = (index / size_a) % channels_a;
int_tp aidx = batch_id * channels_a + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
bottom_a[aidx] = (backward_a == 1) ? top[index] : 0;
} else {
int_tp channel_id = (index / size_a) % channels_b;
int_tp bidx = (batch_id * channels_b + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
bottom_b[bidx] = (backward_b == 1) ? top[index] : 0;
}
}
}
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads,
const int_tp dims,
__global const Dtype* bottom_a,
const int_tp forward_a,
__global const Dtype* bottom_b,
const int_tp forward_b,
__global Dtype* top,
const int_tp num,
const int_tp channels,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp batch_id = index / (channels * size_a);
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
top[index] = 0;
int_tp channel_id = (index / size_a) % channels;
int_tp aidx = batch_id * channels + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index];
int_tp bidx = (batch_id * channels + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index];
}
}
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads,
const int_tp dims,
__global Dtype* bottom_a,
const int_tp backward_a,
__global Dtype* bottom_b,
const int_tp backward_b,
__global const Dtype* top,
const int_tp num,
const int_tp channels,
__global const int_tp* shape_a,
__global const int_tp* shape_b) {
int_tp pad[6];
int_tp tmp_idx[6];
int_tp size_a = 1;
int_tp size_b = 1;
for (int_tp i = 0; i < dims; ++i) {
pad[i] = (shape_b[i] - shape_a[i]) / 2;
size_a *= shape_a[i];
size_b *= shape_b[i];
}
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp batch_id = index / (channels * size_a);
int_tp counter = index;
for (int_tp i = dims - 1; i >= 0; --i) {
tmp_idx[i] = counter % shape_a[i];
counter /= shape_a[i];
}
int_tp channel_id = (index / size_a) % channels;
int_tp aidx = batch_id * channels + channel_id;
for (int_tp i = 0; i < dims; ++i) {
aidx *= shape_a[i];
aidx += tmp_idx[i];
}
bottom_a[aidx] = backward_a ? top[index] : 0;
int_tp bidx = (batch_id * channels + channel_id) * size_b;
int_tp btemp = 1;
for (int_tp i = dims - 1; i >= 0; --i) {
bidx += btemp * (tmp_idx[i] + pad[i]);
btemp *= shape_b[i];
}
bottom_b[bidx] = backward_b ? top[index] : 0;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(max_pool_forward,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h,
const int_tp pad_w,
__global Dtype* top_data,
const int use_mask, __global int_tp* mask, __global Dtype* top_mask) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
const int_tp hend = min(hstart + kernel_h, height);
const int_tp wend = min(wstart + kernel_w, width);
hstart = max(hstart, (int_tp)0);
wstart = max(wstart, (int_tp)0);
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
if (bottom_slice[h * width + w] > maxval) {
maxidx = h * width + w;
maxval = bottom_slice[maxidx];
}
}
}
top_data[index] = maxval;
if (use_mask == 1) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(ave_pool_forward,Dtype)(
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h,
const int_tp pad_w, __global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
{
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + kernel_h, height + pad_h);
int_tp wend = min(wstart + kernel_w, width + pad_w);
const int_tp pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, (int_tp)0);
wstart = max(wstart, (int_tp)0);
hend = min(hend, height);
wend = min(wend, width);
Dtype aveval = 0;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
aveval += bottom_slice[h * width + w];
}
}
top_data[index] = aveval / pool_size;
}
}
}
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w,
__global Dtype* rand_idx,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
const int_tp hstart = ph * stride_h;
const int_tp hend = min(hstart + kernel_h, height);
const int_tp wstart = pw * stride_w;
const int_tp wend = min(wstart + kernel_w, width);
Dtype cumsum = 0.;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w];
}
}
const float thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index.
cumsum = 0;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w];
if (cumsum >= thres) {
rand_idx[index] = ((n * channels + c) * height + h) * width + w;
top_data[index] = bottom_slice[h * width + w];
h = hend;
w = wend;
}
}
}
}
}
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h,
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp pw = index % pooled_width;
const int_tp ph = (index / pooled_width) % pooled_height;
const int_tp c = (index / pooled_width / pooled_height) % channels;
const int_tp n = index / pooled_width / pooled_height / channels;
const int_tp hstart = ph * stride_h;
const int_tp hend = min(hstart + kernel_h, height);
const int_tp wstart = pw * stride_w;
const int_tp wend = min(wstart + kernel_w, width);
    // Seed cumsum with FLT_MIN (rather than 0) to avoid divide-by-zero problems
Dtype cumsum = FLT_MIN;
Dtype cumvalues = 0.;
__global const Dtype* bottom_slice = bottom_data
+ (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w];
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
}
}
top_data[index] = cumvalues / cumsum;
}
}
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int use_mask,
__global const int_tp* mask,
__global const Dtype* top_mask,
const int_tp num,
const int_tp channels,
const int_tp height,
const int_tp width,
const int_tp pooled_height,
const int_tp pooled_width,
const int_tp kernel_h,
const int_tp kernel_w,
const int_tp stride_h,
const int_tp stride_w,
const int_tp pad_h,
const int_tp pad_w,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local index
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp c = (index / width / height) % channels;
const int_tp n = index / width / height / channels;
const int_tp phstart =
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height);
const int_tp pwstart =
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width);
Dtype gradient = 0;
const int_tp offset = (n * channels + c) * pooled_height * pooled_width;
__global const Dtype* top_diff_slice = top_diff + offset;
if (use_mask == 1) {
__global const int_tp* mask_slice = mask + offset;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
if (mask_slice[ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_slice[ph * pooled_width + pw];
}
}
}
} else {
__global const Dtype* top_mask_slice = top_mask + offset;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_slice[ph * pooled_width + pw];
}
}
}
}
bottom_diff[index] = gradient;
}
}
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int_tp num,
const int_tp channels,
const int_tp height,
const int_tp width,
const int_tp pooled_height,
const int_tp pooled_width,
const int_tp kernel_h,
const int_tp kernel_w,
const int_tp stride_h,
const int_tp stride_w,
const int_tp pad_h,
const int_tp pad_w,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local index
// find out the local offset
const int_tp w = index % width + pad_w;
const int_tp h = (index / width) % height + pad_h;
const int_tp c = (index / width / height) % channels;
const int_tp n = index / width / height / channels;
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int_tp phend = min(h / stride_h + 1, pooled_height);
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int_tp pwend = min(w / stride_w + 1, pooled_width);
Dtype gradient = 0.0;
__global const Dtype* const top_diff_slice = top_diff
+ (n * channels + c) * pooled_height * pooled_width;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + kernel_h, height + pad_h);
int_tp wend = min(wstart + kernel_w, width + pad_w);
int_tp pool_size = (hend - hstart) * (wend - wstart);
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
}
}
bottom_diff[index] = gradient;
}
}
__kernel void TEMPLATE(sto_pool_backward,Dtype)(
const int_tp nthreads, __global const Dtype* rand_idx,
__global const Dtype* const top_diff, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
const int_tp stride_w, __global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
// find out the local index
// find out the local offset
const int_tp w = index % width;
const int_tp h = (index / width) % height;
const int_tp c = (index / width / height) % channels;
const int_tp n = index / width / height / channels;
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int_tp phend = min(h / stride_h + 1, pooled_height);
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int_tp pwend = min(w / stride_w + 1, pooled_width);
Dtype gradient = 0.0;
__global const Dtype* rand_idx_slice = rand_idx
+ (n * channels + c) * pooled_height * pooled_width;
__global const Dtype* top_diff_slice = top_diff
+ (n * channels + c) * pooled_height * pooled_width;
for (int_tp ph = phstart; ph < phend; ++ph) {
for (int_tp pw = pwstart; pw < pwend; ++pw) {
gradient += top_diff_slice[ph * pooled_width + pw]
* (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0);
}
}
bottom_diff[index] = gradient;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n,
const int_tp num_axes,
__global const Dtype* bottom_data,
const int_tp channels,
__global const int_tp* size,
__global const int_tp* pooled_size,
__global const int_tp* kernel_size,
__global const int_tp* ext_kernel_size,
__global const int_tp* stride,
__global const int_tp* dilation,
__global const int_tp* pad,
__global Dtype* top_data,
const int use_mask,
__global int_tp* mask, __global Dtype* top_mask) {
int_tp d_idx[6];
int_tp d_start[6];
int_tp d_end[6];
int_tp d_iter[6];
int_tp i;
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
int_tp offset = 1;
int_tp num = index;
bool do_continue = false;
for (i = num_axes - 1; i >= 0; --i) {
d_idx[i] = num % pooled_size[i];
d_start[i] = d_idx[i] * stride[i] - pad[i];
d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]);
d_start[i] = max(d_start[i], (int_tp)0);
num /= pooled_size[i];
offset *= size[i];
d_iter[i] = d_start[i];
if (d_start[i] >= d_end[i]) {
top_data[index] = -FLT_MAX;
if (use_mask) {
mask[index] = -1;
} else {
top_mask[index] = -1;
}
do_continue = true;
}
}
if(do_continue) {
continue;
}
int_tp chan = num % channels;
num /= channels;
offset *= (num * channels + chan);
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
int_tp final_offset = 0;
bool incremented;
do {
final_offset = offset;
int_tp size_prod = 1;
for (i = num_axes - 1; i >= 0; --i) {
final_offset += d_iter[i] * size_prod;
size_prod *= size[i];
}
if (bottom_data[final_offset] > maxval) {
maxidx = final_offset;
maxval = bottom_data[maxidx];
}
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
if (d_iter[i] >= d_end[i] - dilation[i]) {
d_iter[i] = d_start[i];
} else {
d_iter[i] += dilation[i];
incremented = true;
break;
}
}
} while (incremented);
top_data[index] = maxval;
if (use_mask == 1) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n,
const int_tp num_axes,
__global const Dtype* top_diff,
const int use_mask,
__global const int_tp* mask,
__global const Dtype* top_mask,
const int_tp channels,
__global const int_tp* size,
__global const int_tp* pooled_size,
__global const int_tp* kernel_size,
__global const int_tp* ext_kernel_size,
__global const int_tp* stride,
__global const int_tp* dilation,
__global const int_tp* pad,
__global Dtype* bottom_diff) {
int_tp d_idx[6];
int_tp d_start[6];
int_tp d_end[6];
int_tp d_iter[6];
int_tp i;
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
// find out the local index
// find out the local offset
int_tp offset = 1;
int_tp num = index;
for (i = num_axes - 1; i >= 0; --i) {
d_idx[i] = num % size[i];
if (dilation[i] > 1) {
d_start[i] =
(d_idx[i] < ext_kernel_size[i]) ?
d_idx[i] % dilation[i] : (d_idx[i] - ext_kernel_size[i]) + 1;
d_end[i] =
(d_idx[i] >= pooled_size[i]) ?
(pooled_size[i] - 1)
- (pooled_size[i] - 1 - d_start[i]) % dilation[i] :
d_idx[i];
} else {
d_start[i] =
(d_idx[i] + pad[i] < kernel_size[i]) ?
0 : (d_idx[i] + pad[i] - kernel_size[i]) / stride[i] + 1;
d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i] + 1),
(int_tp) (pooled_size[i]));
}
num /= size[i];
offset *= pooled_size[i];
d_iter[i] = d_start[i];
if (d_start[i] > d_end[i]) {
bottom_diff[index] = 0;
return;
}
}
int_tp chan = num % channels;
num /= channels;
offset *= (num * channels + chan);
Dtype gradient = 0;
int_tp final_offset = 0;
int_tp im_offset = 0;
bool incremented;
do {
final_offset = offset;
im_offset = 0;
int_tp size_prod = 1;
int_tp pooled_size_prod = 1;
for (i = num_axes - 1; i >= 0; --i) {
final_offset += d_iter[i] * pooled_size_prod;
im_offset += d_idx[i] * size_prod;
size_prod *= size[i];
pooled_size_prod *= pooled_size[i];
}
if (use_mask) {
if (mask[final_offset] == im_offset) {
gradient += top_diff[final_offset];
}
} else {
if (top_mask[final_offset] == im_offset) {
gradient += top_diff[final_offset];
}
}
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
if (d_iter[i] > d_end[i] - dilation[i]) {
d_iter[i] = d_start[i];
} else {
d_iter[i] += dilation[i];
incremented = true;
break;
}
}
} while (incremented);
bottom_diff[index] = gradient;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads,
__global Dtype* bottom_data,
const int_tp num,
const int_tp channels,
const int_tp height,
const int_tp width,
const int_tp pooled_height,
const int_tp pooled_width,
const int_tp kernel_h,
const int_tp kernel_w,
const int_tp ext_kernel_h,
const int_tp ext_kernel_w,
const int_tp stride_h,
const int_tp stride_w,
const int_tp dilation_h,
const int_tp dilation_w,
const int_tp pad_h,
const int_tp pad_w,
__global Dtype* top_data,
const int use_mask,
__global int_tp* mask,
__global Dtype* top_mask) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + ext_kernel_h, height);
int_tp wend = min(wstart + ext_kernel_w, width);
hstart = max(hstart, (int_tp) 0);
wstart = max(wstart, (int_tp) 0);
Dtype maxval = -FLT_MAX;
int_tp maxidx = -1;
__global Dtype* bottom_data_ptr = bottom_data
+ (n * channels + c) * height * width;
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
if (bottom_data_ptr[h * width + w] > maxval) {
maxidx = h * width + w;
maxval = bottom_data_ptr[maxidx];
}
}
}
top_data[index] = maxval;
if (use_mask == 1) {
mask[index] = maxidx;
} else {
top_mask[index] = maxidx;
}
}
}
__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(
const int_tp nthreads, __global const Dtype* top_diff, const int use_mask,
__global const int_tp* mask, __global const Dtype* top_mask,
const int_tp num, const int_tp channels, const int_tp height,
const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h,
const int_tp pad_w,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
__global const int_tp* mask_ptr = mask;
__global const Dtype* top_diff_ptr = top_diff;
// find out the local index
// find out the local offset
int_tp w = index % width;
int_tp h = (index / width) % height;
int_tp c = (index / width / height) % channels;
int_tp n = index / width / height / channels;
int_tp pooled_height_1 = pooled_height - 1;
int_tp pooled_width_1 = pooled_width - 1;
int_tp phstart =
(h < ext_kernel_h) ? h % dilation_h : (h - ext_kernel_h) + 1;
int_tp phend =
(h >= pooled_height) ?
pooled_height_1 - (pooled_height_1 - phstart) % dilation_h : h;
int_tp pwstart =
(w < ext_kernel_w) ? w % dilation_w : (w - ext_kernel_w) + 1;
int_tp pwend =
(w >= pooled_width) ?
pooled_width_1 - (pooled_width_1 - pwstart) % dilation_w : w;
Dtype gradient = 0;
int_tp offset = (n * channels + c) * pooled_height * pooled_width;
top_diff_ptr += offset;
if (use_mask == 1) {
mask_ptr += offset;
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
if (mask_ptr[ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_ptr[ph * pooled_width + pw];
}
}
}
} else {
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
          if (top_mask[offset + ph * pooled_width + pw] == h * width + w) {
gradient += top_diff_ptr[ph * pooled_width + pw];
}
}
}
}
bottom_diff[index] = gradient;
}
}
__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h,
const int_tp pad_w,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h - pad_h;
int_tp wstart = pw * stride_w - pad_w;
int_tp hend = min(hstart + ext_kernel_h, height + pad_h);
int_tp wend = min(wstart + ext_kernel_w, width + pad_w);
hstart = max(hstart, (int_tp)0);
wstart = max(wstart, (int_tp)0);
hend = min(hend, height);
wend = min(wend, width);
Dtype aveval = 0;
__global const Dtype* bottom_data_ptr = bottom_data;
bottom_data_ptr += (n * channels + c) * height * width;
int_tp pool_size = 0;
for (int_tp h = hstart; h < hend; ++h) {
for (int_tp w = wstart; w < wend; ++w) {
aveval += bottom_data_ptr[h * width + w];
++pool_size;
}
}
top_data[index] = aveval / pool_size;
}
}
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h;
int_tp hend = min(hstart + ext_kernel_h, height);
int_tp wstart = pw * stride_w;
int_tp wend = min(wstart + ext_kernel_w, width);
Dtype cumsum = 0.;
__global const Dtype* bottom_data_ptr = bottom_data;
bottom_data_ptr += (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
cumsum += bottom_data_ptr[h * width + w];
}
}
float thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index.
cumsum = 0;
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
cumsum += bottom_data_ptr[h * width + w];
if (cumsum >= thres) {
rand_idx[index] = ((n * channels + c) * height + h) * width + w;
top_data[index] = bottom_data_ptr[h * width + w];
h = hend;
w = wend;
}
}
}
}
}
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num,
const int_tp channels, const int_tp height, const int_tp width,
const int_tp pooled_height, const int_tp pooled_width,
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
const int_tp dilation_h, const int_tp dilation_w,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
int_tp pw = index % pooled_width;
int_tp ph = (index / pooled_width) % pooled_height;
int_tp c = (index / pooled_width / pooled_height) % channels;
int_tp n = index / pooled_width / pooled_height / channels;
int_tp hstart = ph * stride_h;
int_tp hend = min(hstart + ext_kernel_h, height);
int_tp wstart = pw * stride_w;
int_tp wend = min(wstart + ext_kernel_w, width);
    // Seed cumsum with FLT_MIN (rather than 0) to avoid divide-by-zero problems
Dtype cumsum = FLT_MIN;
Dtype cumvalues = 0.;
__global const Dtype* bottom_data_ptr = bottom_data;
bottom_data_ptr += (n * channels + c) * height * width;
// First pass: get sum
for (int_tp h = hstart; h < hend; h += dilation_h) {
for (int_tp w = wstart; w < wend; w += dilation_w) {
cumsum += bottom_data_ptr[h * width + w];
cumvalues += bottom_data_ptr[h * width + w]
* bottom_data_ptr[h * width + w];
}
}
top_data[index] = cumvalues / cumsum;
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads,
__global const Dtype* in_data,
const int forward, const int_tp num_slices,
const int_tp slice_size,
const int_tp bottom_slice_axis,
const int_tp top_slice_axis,
const int_tp offset_slice_axis,
__global Dtype* out_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp total_slice_size = slice_size * top_slice_axis;
const int_tp slice_num = index / total_slice_size;
const int_tp slice_index = index % total_slice_size;
const int_tp bottom_index = slice_index
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
if (forward == 1) {
out_data[index] = in_data[bottom_index];
} else {
out_data[bottom_index] = in_data[index];
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(softmax_loss_forward,Dtype)(
    const int_tp nthreads, __global const Dtype* prob_data, __global const Dtype* label,
__global Dtype* loss,
const int_tp num, const int_tp dim, const int_tp spatial_dim,
const int has_ignore_label_, const int_tp ignore_label_,
__global Dtype* counts) {
  for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp n = index / spatial_dim;
const int_tp s = index % spatial_dim;
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]);
if (has_ignore_label_ == 1 && label_value == ignore_label_) {
loss[index] = 0;
counts[index] = 0;
} else {
loss[index] = -log((Dtype)(
max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),
(Dtype) FLT_MIN)));
counts[index] = 1;
}
}
}
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top,
__global const Dtype* label,
__global Dtype* bottom_diff,
const int_tp num,
const int_tp dim,
const int_tp spatial_dim,
const int has_ignore_label_,
const int_tp ignore_label_,
__global Dtype* counts) {
const int_tp channels = dim / spatial_dim;
for (int_tp index = get_global_id(0); index < nthreads; index +=
get_global_size(0)) {
const int_tp n = index / spatial_dim;
const int_tp s = index % spatial_dim;
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]);
if (has_ignore_label_ == 1 && label_value == ignore_label_) {
for (int_tp c = 0; c < channels; ++c) {
bottom_diff[n * dim + c * spatial_dim + s] = 0;
}
counts[index] = 0;
} else {
bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
counts[index] = 1;
}
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
__global Dtype* h2,
Dtype momentum,
Dtype delta,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi;
gi = gi * sqrt((h2[i] + delta) / (hi + delta));
h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi;
g[i] = local_rate * gi;
}
}
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype delta,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype hi = h[i] = h[i] + gi * gi;
g[i] = local_rate * gi / (sqrt(hi) + delta);
}
}
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* m,
__global Dtype* v,
Dtype beta1,
Dtype beta2,
Dtype eps_hat,
Dtype corrected_local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat);
}
}
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype momentum,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype hi = h[i];
Dtype hi_new = h[i] = momentum * hi + local_rate * g[i];
g[i] = (1 + momentum) * hi_new - momentum * hi;
}
}
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype rms_decay,
Dtype delta,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
Dtype gi = g[i];
Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi;
g[i] = local_rate * g[i] / (sqrt(hi) + delta);
}
}
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g,
__global Dtype* h,
Dtype momentum,
Dtype local_rate) {
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
g[i] = h[i] = momentum * h[i] + local_rate * g[i];
}
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data,
const int_tp tile_size, const int_tp num_tiles,
const int_tp bottom_tile_axis,
__global Dtype* top_data) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp d = index % tile_size;
const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis;
const int_tp n = index / tile_size / num_tiles / bottom_tile_axis;
const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d;
top_data[index] = bottom_data[bottom_index];
}
}
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads,
__global const Dtype* top_diff,
const int_tp tile_size,
const int_tp num_tiles,
const int_tp bottom_tile_axis,
__global Dtype* bottom_diff) {
for (int_tp index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
const int_tp d = index % tile_size;
const int_tp b = (index / tile_size) % bottom_tile_axis;
const int_tp n = index / tile_size / bottom_tile_axis;
bottom_diff[index] = 0;
int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d;
for (int_tp t = 0; t < num_tiles; ++t) {
bottom_diff[index] += top_diff[top_index];
top_index += bottom_tile_axis * tile_size;
}
}
}
#endif // DOUBLE_SUPPORT_AVAILABLE
Note: Google Test filter = *OpenCLKernelCompileTest*
[==========] Running 2 tests from 2 test cases.
[----------] Global test environment set-up.
[----------] 1 test from OpenCLKernelCompileTest/0, where TypeParam = float
[ RUN ] OpenCLKernelCompileTest/0.TestCompile
Kernel bundle: activation: OK
Kernel bundle: auxiliary: OK
Kernel bundle: batch_reindex: OK
Kernel bundle: benchmark: OK
Kernel bundle: bias: OK
Kernel bundle: bnll: OK
Kernel bundle: channel: OK
Kernel bundle: concat: OK
Kernel bundle: contrastive_loss: OK
Kernel bundle: conv_layer_spatial: OK
Kernel bundle: conv_spatial_helper: OK
Kernel bundle: crop: OK
Kernel bundle: dropout: OK
Kernel bundle: eltwise: OK
Kernel bundle: elu: OK
BUILD LOG:
(92:0) : error : atomic function "atomic_cmpxchg" not supported
Kernel bundle: fft: OK
Kernel bundle: fillbuffer: OK
Kernel bundle: im2col: OK
Kernel bundle: im2col_nd: OK
Kernel bundle: lrn: OK
Kernel bundle: lstm_unit: OK
Kernel bundle: math: OK
Kernel bundle: mergecrop: OK
Kernel bundle: pooling: OK
Kernel bundle: pooling_nd: OK
Kernel bundle: pooling_sk: OK
Kernel bundle: slice: OK
Kernel bundle: softmax_loss: OK
Kernel bundle: solvers: OK
Kernel bundle: tile: OK
src/caffe/test/test_ocl_kernel_compile.cpp:79: Failure
Value of: failure
Actual: true
Expected: false
[ FAILED ] OpenCLKernelCompileTest/0.TestCompile, where TypeParam = float (775 ms)
[----------] 1 test from OpenCLKernelCompileTest/0 (775 ms total)
[----------] 1 test from OpenCLKernelCompileTest/1, where TypeParam = double
[ RUN ] OpenCLKernelCompileTest/1.TestCompile
[ OK ] OpenCLKernelCompileTest/1.TestCompile (0 ms)
[----------] 1 test from OpenCLKernelCompileTest/1 (1 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 2 test cases ran. (777 ms total)
[ PASSED ] 1 test.
[ FAILED ] 1 test, listed below:
[ FAILED ] OpenCLKernelCompileTest/0.TestCompile, where TypeParam = float
1 FAILED TEST
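The failing bundle is presumably the one containing the float atomic_add above, since that is the only place in these sources that calls atomic_cmpxchg directly. As a minimal host-side sketch (not part of the gist; it assumes only the standard OpenCL headers, queries the first reported platform/device, and omits error handling), one could check which atomic capabilities a device advertises before enabling that code path:

#define CL_TARGET_OPENCL_VERSION 120  /* silence deprecation warnings on newer headers */
#include <stdio.h>
#include <string.h>
#include <CL/cl.h>

/* Returns nonzero if the device extension string contains the given name. */
static int has_ext(const char *exts, const char *name) {
  return strstr(exts, name) != NULL;
}

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  char version[256] = {0};
  char exts[16384] = {0};

  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(exts), exts, NULL);

  printf("Device OpenCL version: %s\n", version);
  printf("cl_khr_global_int32_base_atomics: %s\n",
         has_ext(exts, "cl_khr_global_int32_base_atomics") ? "yes" : "no");
  printf("cl_khr_int64_base_atomics: %s\n",
         has_ext(exts, "cl_khr_int64_base_atomics") ? "yes" : "no");
  return 0;
}

If the device reports OpenCL 1.1 or later, atomic_cmpxchg on 32-bit global integers is core functionality, so a rejection like the one logged here would likely point at the driver/compiler rather than the kernel source.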