Skip to content

Instantly share code, notes, and snippets.

@moskewcz
Last active October 14, 2015 21:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save moskewcz/20b4b19818622c3d3904 to your computer and use it in GitHub Desktop.
Save moskewcz/20b4b19818622c3d3904 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
// Integer type names used by the generated kernels. Only int32_t is defined here;
// the other two typedefs are intentionally left disabled.
//typedef unsigned uint32_t;
typedef int int32_t;
//typedef long long int64_t;
// Portability layer: neutral names used by the kernel source, mapped onto OpenCL C.
// Kernel entry-point qualifier.
#define CUCL_GLOBAL_KERNEL kernel
// Address-space qualifier for pointers into global memory.
#define GASQ global
// 1D work-item / work-group indexing helpers (dimension 0 only).
#define GLOB_ID_1D get_global_id(0)
#define LOC_ID_1D get_local_id(0)
#define GRP_ID_1D get_group_id(0)
#define LOC_SZ_1D get_local_size(0)
// Qualifier for declaring work-group local memory, and for pointers into it.
#define LOCSHAR_MEM local
#define LSMASQ local
// Work-group barrier that fences local memory accesses.
#define BARRIER_SYNC barrier(CLK_LOCAL_MEM_FENCE)
// note: it seems OpenCL doesn't provide powf(), but instead overloads pow() for double and float.
// so, we use this as a compatibility wrapper.
// the casts should help uses that might expect implicit casts from double->float when using powf()
// ... or maybe that's a bad idea?
// Both arguments are parenthesized before casting so that the cast applies to the whole
// argument expression: powf(a/b, e) must compute (float)(a/b), not ((float)a)/b.
#define powf(v,e) pow((float)(v),(float)(e))
// Direct convolution + bias + ReLU kernel, specialized for the parameters encoded in its name
// (20 images, 227x227 input, 3 input channels, 7x7 filter, stride 2, pad 3, 64 output channels;
// see the substitution table at the end of this file).
//
// Decomposition, as implied by the index arithmetic below:
//   - the strided loads assume 128 work-items per work-group (tpb in the table);
//   - a work-group covers 16 consecutive output lines x 8 output columns x 64 output channels;
//   - (LOC_ID_1D/8) selects the output line and (LOC_ID_1D%8) selects a group of 8 consecutive
//     output channels; each work-item accumulates an 8 (columns) x 8 (channels) tile of out.
//
// Arguments:
//   filts  - filter weights, consumed as consecutive 448-float slabs, one per (in_chan, ky) step.
//            Inside a slab, element kx*64 + r*8 + t feeds output channel t*8 + r.
//   biases - one bias per output channel (64 values).
//   in     - input, read as one 2646-float slab per work-group: 3 channels x 882 floats, each
//            channel addressed as rows of 21 floats.
//            NOTE(review): presumably pre-blocked and zero-padded by the host -- confirm with caller.
//   out    - output, indexed as img*831744 + chan*12996 + y*114 + x.
//   flags  - 2: return after accumulation, before bias loads and stores;
//            1: return after bias loads, before stores; any other value: run the full kernel.
//            NOTE(review): looks like a profiling/debug switch -- confirm.
//
// All loops over tile/strip elements have compile-time trip counts and carry unroll hints, so the
// per-work-item arrays are only ever indexed by constants once unrolled. The accumulation order
// for every out_tile element is kx = 0..6 within each (in_chan, ky) step.
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_227__in_dim_1_227__kern_sz_7__stride_2__in_pad_3__t_tile_sz_8__conv_has_relu_1__out_chans_64__in_chans_3( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
  LOCSHAR_MEM float all_smem[1330]; // note: max(filts+in,out) == max(448+882,1024)
  LSMASQ float * const filts_smem = all_smem;      // 448 floats: one (in_chan, ky) filter slab
  LSMASQ float * const in_smem = filts_smem + 448; // 882 floats: one input channel of this block
  float out_tile[8*8] = {0.0f}; // accumulators for this work-item, indexed [x*8 + chan]
  float filts_strip[8];         // 8 filter values (later: 8 biases), one per output channel of the tile
  float in_strip[21];           // one input row segment: 8 outputs * stride 2 + 7 taps - 2 = 21 pels

  int32_t blk_in_ix_base = GRP_ID_1D*2646 + LOC_ID_1D; // first input pel this work-item loads
  int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*9408; // first filter element of this out-chan block
  int32_t filts_off = blk_filt_ix_base + LOC_ID_1D;    // first filter element this work-item loads
  LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%8); // this work-item's channel group

  // Output lines are numbered across all images: out_line = img*114 + y.
  int32_t const blk_first_line = (GRP_ID_1D/15)*16;          // first out_line of the block
  int32_t const blk_fli = (blk_first_line/114);              // image of the first out_line of the block
  int32_t const out_line = blk_first_line + (LOC_ID_1D/8);   // out_line of this work-item
  int32_t const out_img = (out_line/114);
  int32_t const out_y = (out_line%114);
  // offset in lines to deal with >1 img/block =
  //   (number of prior images (partial or full) in this block) * (adj to next img)
  int32_t const img_off_lines = (out_img - blk_fli)*(7-2);
  int32_t const in_y = out_y*2 - 3; // input row hit by filter row ky == 0 (may be negative: padding)

  for( int32_t in_chan = 0; in_chan != 3; ++in_chan ) {
    // Everyone must be done reading the previous contents of local memory before it is overwritten.
    BARRIER_SYNC;
    // Stage the 882 input floats of this channel: six full passes of 128, then a guarded tail.
    #pragma unroll
    for( int32_t i = 0; i != 6; ++i ) {
      in_smem[(LOC_ID_1D + 128 * i)] = in[ blk_in_ix_base + (128*i) ];
    }
    if( (LOC_ID_1D + 128 * 6) < 882 ) { in_smem[(LOC_ID_1D + 128 * 6)] = in[ blk_in_ix_base + (128*6) ]; }
    blk_in_ix_base += 882;

    for( int32_t ky = 0; ky != 7; ++ky ) {
      // For ky == 0 the barrier at the top of the in_chan loop already protects filts_smem.
      if( ky != 0 ) { BARRIER_SYNC; }
      // Stage the 448 filter floats of this (in_chan, ky) step: three full passes, then a guarded tail.
      #pragma unroll
      for( int32_t i = 0; i != 3; ++i ) {
        filts_smem[(LOC_ID_1D + 128 * i)] = filts[filts_off+(128*i)];
      }
      if( (LOC_ID_1D + 128 * 3) < 448 ) { filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)]; }
      filts_off += 448;
      BARRIER_SYNC;

      // The skips below come after the barriers, so every work-item still reaches every barrier.
      if( out_img >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid)
      // optimization: skip known-to-be-padding input lines. Valid rows of a 227-row input are
      // 0..226, so the upper test is >= 227 (the generated code used > 227, which let padding
      // row 227 through).
      if( ((in_y+ky) < 0) || ((in_y+ky) >= 227) ) { continue; }

      LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/8)*2+ky+img_off_lines)*21;
      #pragma unroll
      for( int32_t i = 0; i != 21; ++i ) { in_strip[i] = in_smem_off[i]; }

      #pragma unroll
      for( int32_t kx = 0; kx != 7; ++kx ) {
        // Filter tap kx for the 8 output channels of this work-item.
        #pragma unroll
        for( int32_t r = 0; r != 8; ++r ) { filts_strip[r] = filts_smem_off[kx*64+r*8]; }
        #pragma unroll
        for( int32_t tx = 0; tx != 8; ++tx ) {
          float const in_val = in_strip[tx*2 + kx]; // stride 2 between neighbouring outputs
          #pragma unroll
          for( int32_t r = 0; r != 8; ++r ) { out_tile[tx*8 + r] += filts_strip[r]*in_val; }
        }
      }
    }
  }
  if( flags == 2 ) { return; }

  // All work-items must be done with the last filter slab before filts_smem is reused for biases.
  BARRIER_SYNC;
  {
    // 64 biases fit in a single pass; only the first 64 work-items load one each.
    int32_t const t_smem_bias_ix = LOC_ID_1D;
    if( t_smem_bias_ix < 64 ) {
      int32_t const ocix_base = (GRP_ID_1D%1)*64;
      int32_t const load_reg = t_smem_bias_ix / 8;
      int32_t const load_tile = t_smem_bias_ix % 8;
      // Stored transposed so that filts_smem_off[r*8] below yields the bias of channel tile*8 + r.
      int32_t const ocix = ocix_base + load_tile*8 + load_reg;
      if( ocix < 64 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
    }
  }
  BARRIER_SYNC;
  // From here on filts_strip holds the 8 biases of this work-item's output channels.
  #pragma unroll
  for( int32_t r = 0; r != 8; ++r ) { filts_strip[r] = filts_smem_off[r*8]; }
  if( flags == 1 ) { return; }

  // Store the tile: out = max(0, acc + bias) (ReLU), skipping anything outside the output.
  if( out_img >= 20 ) { return; }
  int32_t const out_x = (GRP_ID_1D%15)*8;
  int32_t const out_chan = ((GRP_ID_1D%1)*8 + (LOC_ID_1D%8))*8;
  GASQ float * const out_off = out + out_img*831744 + out_chan*12996 + out_y*114 + out_x*1;
  #pragma unroll
  for( int32_t tx = 0; tx != 8; ++tx ) {
    // this x value and the following are off-the-end patches, so don't store them.
    if( (out_x + tx) >= 114 ) { return; }
    #pragma unroll
    for( int32_t r = 0; r != 8; ++r ) {
      if( (out_chan + r) < 64 ) { out_off[ r*12996 + tx*1 ] = max(0.0f,out_tile[tx*8 + r] + filts_strip[r]); }
    }
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 227 */
/* in_dim_1 = 227 */
/* kern_sz = 7 */
/* stride = 2 */
/* in_pad = 3 */
/* t_tile_sz = 8 */
/* conv_has_relu = 1 */
/* out_chans = 64 */
/* in_chans = 3 */
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_227__in_dim_1_227__kern_sz_7__stride_2__in_pad_3__t_tile_sz_8__conv_has_relu_1__out_chans_64__in_chans_3 */
/* out_ix_x_dim = 114 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%114) */
/* out_ix_y_dim = 114 */
/* out_ix_y_sz = 114 */
/* out_ix_y_nomod = (out_ix/114) */
/* out_ix_y = ((out_ix/114)%%114) */
/* out_ix_chan_dim = 64 */
/* out_ix_chan_sz = 12996 */
/* out_ix_chan_nomod = (out_ix/12996) */
/* out_ix_chan = ((out_ix/12996)%%64) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 831744 */
/* out_ix_img_nomod = (out_ix/831744) */
/* out_ix_img = (out_ix/831744) */
/* out_ix_sz = 16634880 */
/* tpb = 128 */
/* out_line_y_dim = 114 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%114) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 114 */
/* out_line_img_nomod = (out_line/114) */
/* out_line_img = (out_line/114) */
/* out_line_sz = 2280 */
/* in_ix_blk_x_dim = 21 */
/* in_ix_blk_x_sz = 1 */
/* in_ix_blk_x_nomod = in_ix */
/* in_ix_blk_x = (in_ix%%21) */
/* in_ix_blk_y_dim = 42 */
/* in_ix_blk_y_sz = 21 */
/* in_ix_blk_y_nomod = (in_ix/21) */
/* in_ix_blk_y = ((in_ix/21)%%42) */
/* in_ix_blk_in_chan_dim = 3 */
/* in_ix_blk_in_chan_sz = 882 */
/* in_ix_blk_in_chan_nomod = (in_ix/882) */
/* in_ix_blk_in_chan = ((in_ix/882)%%3) */
/* in_ix_blk_bx_dim = 15 */
/* in_ix_blk_bx_sz = 2646 */
/* in_ix_blk_bx_nomod = (in_ix/2646) */
/* in_ix_blk_bx = ((in_ix/2646)%%15) */
/* in_ix_blk_bline_dim = 143 */
/* in_ix_blk_bline_sz = 39690 */
/* in_ix_blk_bline_nomod = (in_ix/39690) */
/* in_ix_blk_bline = (in_ix/39690) */
/* in_ix_sz = 5675670 */
/* LOC_ID_1D_out_chan_tile_dim = 8 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%8) */
/* LOC_ID_1D_blk_y_dim = 16 */
/* LOC_ID_1D_blk_y_sz = 8 */
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/8) */
/* LOC_ID_1D_blk_y = (LOC_ID_1D/8) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_blk_bx_dim = 15 */
/* GRP_ID_1D_blk_bx_sz = 1 */
/* GRP_ID_1D_blk_bx_nomod = GRP_ID_1D */
/* GRP_ID_1D_blk_bx = (GRP_ID_1D%%15) */
/* GRP_ID_1D_blk_bline_dim = 143 */
/* GRP_ID_1D_blk_bline_sz = 15 */
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/15) */
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/15) */
/* GRP_ID_1D_sz = 2145 */
/* blk_filt_ix_sz = 64 */
/* filts_smem_sz = 448 */
/* in_smem_sz = 882 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1330 */
/* filts_xp_ix_out_chan_tile_dim = 8 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 8 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */
/* filts_xp_ix_x_dim = 7 */
/* filts_xp_ix_x_sz = 64 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_x = ((filts_xp_ix/64)%%7) */
/* filts_xp_ix_y_dim = 7 */
/* filts_xp_ix_y_sz = 448 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/448) */
/* filts_xp_ix_y = ((filts_xp_ix/448)%%7) */
/* filts_xp_ix_in_chan_dim = 3 */
/* filts_xp_ix_in_chan_sz = 3136 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/3136) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/3136)%%3) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 9408 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/9408) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/9408) */
/* filts_xp_ix_sz = 9408 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* filt_smem_loads = // begin filt_smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
if( (LOC_ID_1D + %(tpb) * 3) < %(filts_smem_sz) ) { filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)];}
filts_off += %(filts_xp_ix_y_sz);
// end filt_smem_loads */
/* in_smem_loads = // begin in_smem_loads
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ];
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ];
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ];
in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ];
if( (LOC_ID_1D + %(tpb) * 6) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 6)] = in[ blk_in_ix_base + (%(tpb)*6) ];}
blk_in_ix_base += %(in_ix_blk_in_chan_sz);
// end in_smem_loads */
/* inner_loop_body = // begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
in_strip[10] = in_smem_off[10];
in_strip[11] = in_smem_off[11];
in_strip[12] = in_smem_off[12];
in_strip[13] = in_smem_off[13];
in_strip[14] = in_smem_off[14];
in_strip[15] = in_smem_off[15];
in_strip[16] = in_smem_off[16];
in_strip[17] = in_smem_off[17];
in_strip[18] = in_smem_off[18];
in_strip[19] = in_smem_off[19];
in_strip[20] = in_smem_off[20];
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[6];
out_tile[25] += filts_strip[1]*in_strip[6];
out_tile[26] += filts_strip[2]*in_strip[6];
out_tile[27] += filts_strip[3]*in_strip[6];
out_tile[28] += filts_strip[4]*in_strip[6];
out_tile[29] += filts_strip[5]*in_strip[6];
out_tile[30] += filts_strip[6]*in_strip[6];
out_tile[31] += filts_strip[7]*in_strip[6];
out_tile[32] += filts_strip[0]*in_strip[8];
out_tile[33] += filts_strip[1]*in_strip[8];
out_tile[34] += filts_strip[2]*in_strip[8];
out_tile[35] += filts_strip[3]*in_strip[8];
out_tile[36] += filts_strip[4]*in_strip[8];
out_tile[37] += filts_strip[5]*in_strip[8];
out_tile[38] += filts_strip[6]*in_strip[8];
out_tile[39] += filts_strip[7]*in_strip[8];
out_tile[40] += filts_strip[0]*in_strip[10];
out_tile[41] += filts_strip[1]*in_strip[10];
out_tile[42] += filts_strip[2]*in_strip[10];
out_tile[43] += filts_strip[3]*in_strip[10];
out_tile[44] += filts_strip[4]*in_strip[10];
out_tile[45] += filts_strip[5]*in_strip[10];
out_tile[46] += filts_strip[6]*in_strip[10];
out_tile[47] += filts_strip[7]*in_strip[10];
out_tile[48] += filts_strip[0]*in_strip[12];
out_tile[49] += filts_strip[1]*in_strip[12];
out_tile[50] += filts_strip[2]*in_strip[12];
out_tile[51] += filts_strip[3]*in_strip[12];
out_tile[52] += filts_strip[4]*in_strip[12];
out_tile[53] += filts_strip[5]*in_strip[12];
out_tile[54] += filts_strip[6]*in_strip[12];
out_tile[55] += filts_strip[7]*in_strip[12];
out_tile[56] += filts_strip[0]*in_strip[14];
out_tile[57] += filts_strip[1]*in_strip[14];
out_tile[58] += filts_strip[2]*in_strip[14];
out_tile[59] += filts_strip[3]*in_strip[14];
out_tile[60] += filts_strip[4]*in_strip[14];
out_tile[61] += filts_strip[5]*in_strip[14];
out_tile[62] += filts_strip[6]*in_strip[14];
out_tile[63] += filts_strip[7]*in_strip[14];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[5];
out_tile[17] += filts_strip[1]*in_strip[5];
out_tile[18] += filts_strip[2]*in_strip[5];
out_tile[19] += filts_strip[3]*in_strip[5];
out_tile[20] += filts_strip[4]*in_strip[5];
out_tile[21] += filts_strip[5]*in_strip[5];
out_tile[22] += filts_strip[6]*in_strip[5];
out_tile[23] += filts_strip[7]*in_strip[5];
out_tile[24] += filts_strip[0]*in_strip[7];
out_tile[25] += filts_strip[1]*in_strip[7];
out_tile[26] += filts_strip[2]*in_strip[7];
out_tile[27] += filts_strip[3]*in_strip[7];
out_tile[28] += filts_strip[4]*in_strip[7];
out_tile[29] += filts_strip[5]*in_strip[7];
out_tile[30] += filts_strip[6]*in_strip[7];
out_tile[31] += filts_strip[7]*in_strip[7];
out_tile[32] += filts_strip[0]*in_strip[9];
out_tile[33] += filts_strip[1]*in_strip[9];
out_tile[34] += filts_strip[2]*in_strip[9];
out_tile[35] += filts_strip[3]*in_strip[9];
out_tile[36] += filts_strip[4]*in_strip[9];
out_tile[37] += filts_strip[5]*in_strip[9];
out_tile[38] += filts_strip[6]*in_strip[9];
out_tile[39] += filts_strip[7]*in_strip[9];
out_tile[40] += filts_strip[0]*in_strip[11];
out_tile[41] += filts_strip[1]*in_strip[11];
out_tile[42] += filts_strip[2]*in_strip[11];
out_tile[43] += filts_strip[3]*in_strip[11];
out_tile[44] += filts_strip[4]*in_strip[11];
out_tile[45] += filts_strip[5]*in_strip[11];
out_tile[46] += filts_strip[6]*in_strip[11];
out_tile[47] += filts_strip[7]*in_strip[11];
out_tile[48] += filts_strip[0]*in_strip[13];
out_tile[49] += filts_strip[1]*in_strip[13];
out_tile[50] += filts_strip[2]*in_strip[13];
out_tile[51] += filts_strip[3]*in_strip[13];
out_tile[52] += filts_strip[4]*in_strip[13];
out_tile[53] += filts_strip[5]*in_strip[13];
out_tile[54] += filts_strip[6]*in_strip[13];
out_tile[55] += filts_strip[7]*in_strip[13];
out_tile[56] += filts_strip[0]*in_strip[15];
out_tile[57] += filts_strip[1]*in_strip[15];
out_tile[58] += filts_strip[2]*in_strip[15];
out_tile[59] += filts_strip[3]*in_strip[15];
out_tile[60] += filts_strip[4]*in_strip[15];
out_tile[61] += filts_strip[5]*in_strip[15];
out_tile[62] += filts_strip[6]*in_strip[15];
out_tile[63] += filts_strip[7]*in_strip[15];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[4];
out_tile[9] += filts_strip[1]*in_strip[4];
out_tile[10] += filts_strip[2]*in_strip[4];
out_tile[11] += filts_strip[3]*in_strip[4];
out_tile[12] += filts_strip[4]*in_strip[4];
out_tile[13] += filts_strip[5]*in_strip[4];
out_tile[14] += filts_strip[6]*in_strip[4];
out_tile[15] += filts_strip[7]*in_strip[4];
out_tile[16] += filts_strip[0]*in_strip[6];
out_tile[17] += filts_strip[1]*in_strip[6];
out_tile[18] += filts_strip[2]*in_strip[6];
out_tile[19] += filts_strip[3]*in_strip[6];
out_tile[20] += filts_strip[4]*in_strip[6];
out_tile[21] += filts_strip[5]*in_strip[6];
out_tile[22] += filts_strip[6]*in_strip[6];
out_tile[23] += filts_strip[7]*in_strip[6];
out_tile[24] += filts_strip[0]*in_strip[8];
out_tile[25] += filts_strip[1]*in_strip[8];
out_tile[26] += filts_strip[2]*in_strip[8];
out_tile[27] += filts_strip[3]*in_strip[8];
out_tile[28] += filts_strip[4]*in_strip[8];
out_tile[29] += filts_strip[5]*in_strip[8];
out_tile[30] += filts_strip[6]*in_strip[8];
out_tile[31] += filts_strip[7]*in_strip[8];
out_tile[32] += filts_strip[0]*in_strip[10];
out_tile[33] += filts_strip[1]*in_strip[10];
out_tile[34] += filts_strip[2]*in_strip[10];
out_tile[35] += filts_strip[3]*in_strip[10];
out_tile[36] += filts_strip[4]*in_strip[10];
out_tile[37] += filts_strip[5]*in_strip[10];
out_tile[38] += filts_strip[6]*in_strip[10];
out_tile[39] += filts_strip[7]*in_strip[10];
out_tile[40] += filts_strip[0]*in_strip[12];
out_tile[41] += filts_strip[1]*in_strip[12];
out_tile[42] += filts_strip[2]*in_strip[12];
out_tile[43] += filts_strip[3]*in_strip[12];
out_tile[44] += filts_strip[4]*in_strip[12];
out_tile[45] += filts_strip[5]*in_strip[12];
out_tile[46] += filts_strip[6]*in_strip[12];
out_tile[47] += filts_strip[7]*in_strip[12];
out_tile[48] += filts_strip[0]*in_strip[14];
out_tile[49] += filts_strip[1]*in_strip[14];
out_tile[50] += filts_strip[2]*in_strip[14];
out_tile[51] += filts_strip[3]*in_strip[14];
out_tile[52] += filts_strip[4]*in_strip[14];
out_tile[53] += filts_strip[5]*in_strip[14];
out_tile[54] += filts_strip[6]*in_strip[14];
out_tile[55] += filts_strip[7]*in_strip[14];
out_tile[56] += filts_strip[0]*in_strip[16];
out_tile[57] += filts_strip[1]*in_strip[16];
out_tile[58] += filts_strip[2]*in_strip[16];
out_tile[59] += filts_strip[3]*in_strip[16];
out_tile[60] += filts_strip[4]*in_strip[16];
out_tile[61] += filts_strip[5]*in_strip[16];
out_tile[62] += filts_strip[6]*in_strip[16];
out_tile[63] += filts_strip[7]*in_strip[16];
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[3];
out_tile[1] += filts_strip[1]*in_strip[3];
out_tile[2] += filts_strip[2]*in_strip[3];
out_tile[3] += filts_strip[3]*in_strip[3];
out_tile[4] += filts_strip[4]*in_strip[3];
out_tile[5] += filts_strip[5]*in_strip[3];
out_tile[6] += filts_strip[6]*in_strip[3];
out_tile[7] += filts_strip[7]*in_strip[3];
out_tile[8] += filts_strip[0]*in_strip[5];
out_tile[9] += filts_strip[1]*in_strip[5];
out_tile[10] += filts_strip[2]*in_strip[5];
out_tile[11] += filts_strip[3]*in_strip[5];
out_tile[12] += filts_strip[4]*in_strip[5];
out_tile[13] += filts_strip[5]*in_strip[5];
out_tile[14] += filts_strip[6]*in_strip[5];
out_tile[15] += filts_strip[7]*in_strip[5];
out_tile[16] += filts_strip[0]*in_strip[7];
out_tile[17] += filts_strip[1]*in_strip[7];
out_tile[18] += filts_strip[2]*in_strip[7];
out_tile[19] += filts_strip[3]*in_strip[7];
out_tile[20] += filts_strip[4]*in_strip[7];
out_tile[21] += filts_strip[5]*in_strip[7];
out_tile[22] += filts_strip[6]*in_strip[7];
out_tile[23] += filts_strip[7]*in_strip[7];
out_tile[24] += filts_strip[0]*in_strip[9];
out_tile[25] += filts_strip[1]*in_strip[9];
out_tile[26] += filts_strip[2]*in_strip[9];
out_tile[27] += filts_strip[3]*in_strip[9];
out_tile[28] += filts_strip[4]*in_strip[9];
out_tile[29] += filts_strip[5]*in_strip[9];
out_tile[30] += filts_strip[6]*in_strip[9];
out_tile[31] += filts_strip[7]*in_strip[9];
out_tile[32] += filts_strip[0]*in_strip[11];
out_tile[33] += filts_strip[1]*in_strip[11];
out_tile[34] += filts_strip[2]*in_strip[11];
out_tile[35] += filts_strip[3]*in_strip[11];
out_tile[36] += filts_strip[4]*in_strip[11];
out_tile[37] += filts_strip[5]*in_strip[11];
out_tile[38] += filts_strip[6]*in_strip[11];
out_tile[39] += filts_strip[7]*in_strip[11];
out_tile[40] += filts_strip[0]*in_strip[13];
out_tile[41] += filts_strip[1]*in_strip[13];
out_tile[42] += filts_strip[2]*in_strip[13];
out_tile[43] += filts_strip[3]*in_strip[13];
out_tile[44] += filts_strip[4]*in_strip[13];
out_tile[45] += filts_strip[5]*in_strip[13];
out_tile[46] += filts_strip[6]*in_strip[13];
out_tile[47] += filts_strip[7]*in_strip[13];
out_tile[48] += filts_strip[0]*in_strip[15];
out_tile[49] += filts_strip[1]*in_strip[15];
out_tile[50] += filts_strip[2]*in_strip[15];
out_tile[51] += filts_strip[3]*in_strip[15];
out_tile[52] += filts_strip[4]*in_strip[15];
out_tile[53] += filts_strip[5]*in_strip[15];
out_tile[54] += filts_strip[6]*in_strip[15];
out_tile[55] += filts_strip[7]*in_strip[15];
out_tile[56] += filts_strip[0]*in_strip[17];
out_tile[57] += filts_strip[1]*in_strip[17];
out_tile[58] += filts_strip[2]*in_strip[17];
out_tile[59] += filts_strip[3]*in_strip[17];
out_tile[60] += filts_strip[4]*in_strip[17];
out_tile[61] += filts_strip[5]*in_strip[17];
out_tile[62] += filts_strip[6]*in_strip[17];
out_tile[63] += filts_strip[7]*in_strip[17];
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[4];
out_tile[1] += filts_strip[1]*in_strip[4];
out_tile[2] += filts_strip[2]*in_strip[4];
out_tile[3] += filts_strip[3]*in_strip[4];
out_tile[4] += filts_strip[4]*in_strip[4];
out_tile[5] += filts_strip[5]*in_strip[4];
out_tile[6] += filts_strip[6]*in_strip[4];
out_tile[7] += filts_strip[7]*in_strip[4];
out_tile[8] += filts_strip[0]*in_strip[6];
out_tile[9] += filts_strip[1]*in_strip[6];
out_tile[10] += filts_strip[2]*in_strip[6];
out_tile[11] += filts_strip[3]*in_strip[6];
out_tile[12] += filts_strip[4]*in_strip[6];
out_tile[13] += filts_strip[5]*in_strip[6];
out_tile[14] += filts_strip[6]*in_strip[6];
out_tile[15] += filts_strip[7]*in_strip[6];
out_tile[16] += filts_strip[0]*in_strip[8];
out_tile[17] += filts_strip[1]*in_strip[8];
out_tile[18] += filts_strip[2]*in_strip[8];
out_tile[19] += filts_strip[3]*in_strip[8];
out_tile[20] += filts_strip[4]*in_strip[8];
out_tile[21] += filts_strip[5]*in_strip[8];
out_tile[22] += filts_strip[6]*in_strip[8];
out_tile[23] += filts_strip[7]*in_strip[8];
out_tile[24] += filts_strip[0]*in_strip[10];
out_tile[25] += filts_strip[1]*in_strip[10];
out_tile[26] += filts_strip[2]*in_strip[10];
out_tile[27] += filts_strip[3]*in_strip[10];
out_tile[28] += filts_strip[4]*in_strip[10];
out_tile[29] += filts_strip[5]*in_strip[10];
out_tile[30] += filts_strip[6]*in_strip[10];
out_tile[31] += filts_strip[7]*in_strip[10];
out_tile[32] += filts_strip[0]*in_strip[12];
out_tile[33] += filts_strip[1]*in_strip[12];
out_tile[34] += filts_strip[2]*in_strip[12];
out_tile[35] += filts_strip[3]*in_strip[12];
out_tile[36] += filts_strip[4]*in_strip[12];
out_tile[37] += filts_strip[5]*in_strip[12];
out_tile[38] += filts_strip[6]*in_strip[12];
out_tile[39] += filts_strip[7]*in_strip[12];
out_tile[40] += filts_strip[0]*in_strip[14];
out_tile[41] += filts_strip[1]*in_strip[14];
out_tile[42] += filts_strip[2]*in_strip[14];
out_tile[43] += filts_strip[3]*in_strip[14];
out_tile[44] += filts_strip[4]*in_strip[14];
out_tile[45] += filts_strip[5]*in_strip[14];
out_tile[46] += filts_strip[6]*in_strip[14];
out_tile[47] += filts_strip[7]*in_strip[14];
out_tile[48] += filts_strip[0]*in_strip[16];
out_tile[49] += filts_strip[1]*in_strip[16];
out_tile[50] += filts_strip[2]*in_strip[16];
out_tile[51] += filts_strip[3]*in_strip[16];
out_tile[52] += filts_strip[4]*in_strip[16];
out_tile[53] += filts_strip[5]*in_strip[16];
out_tile[54] += filts_strip[6]*in_strip[16];
out_tile[55] += filts_strip[7]*in_strip[16];
out_tile[56] += filts_strip[0]*in_strip[18];
out_tile[57] += filts_strip[1]*in_strip[18];
out_tile[58] += filts_strip[2]*in_strip[18];
out_tile[59] += filts_strip[3]*in_strip[18];
out_tile[60] += filts_strip[4]*in_strip[18];
out_tile[61] += filts_strip[5]*in_strip[18];
out_tile[62] += filts_strip[6]*in_strip[18];
out_tile[63] += filts_strip[7]*in_strip[18];
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[5];
out_tile[1] += filts_strip[1]*in_strip[5];
out_tile[2] += filts_strip[2]*in_strip[5];
out_tile[3] += filts_strip[3]*in_strip[5];
out_tile[4] += filts_strip[4]*in_strip[5];
out_tile[5] += filts_strip[5]*in_strip[5];
out_tile[6] += filts_strip[6]*in_strip[5];
out_tile[7] += filts_strip[7]*in_strip[5];
out_tile[8] += filts_strip[0]*in_strip[7];
out_tile[9] += filts_strip[1]*in_strip[7];
out_tile[10] += filts_strip[2]*in_strip[7];
out_tile[11] += filts_strip[3]*in_strip[7];
out_tile[12] += filts_strip[4]*in_strip[7];
out_tile[13] += filts_strip[5]*in_strip[7];
out_tile[14] += filts_strip[6]*in_strip[7];
out_tile[15] += filts_strip[7]*in_strip[7];
out_tile[16] += filts_strip[0]*in_strip[9];
out_tile[17] += filts_strip[1]*in_strip[9];
out_tile[18] += filts_strip[2]*in_strip[9];
out_tile[19] += filts_strip[3]*in_strip[9];
out_tile[20] += filts_strip[4]*in_strip[9];
out_tile[21] += filts_strip[5]*in_strip[9];
out_tile[22] += filts_strip[6]*in_strip[9];
out_tile[23] += filts_strip[7]*in_strip[9];
out_tile[24] += filts_strip[0]*in_strip[11];
out_tile[25] += filts_strip[1]*in_strip[11];
out_tile[26] += filts_strip[2]*in_strip[11];
out_tile[27] += filts_strip[3]*in_strip[11];
out_tile[28] += filts_strip[4]*in_strip[11];
out_tile[29] += filts_strip[5]*in_strip[11];
out_tile[30] += filts_strip[6]*in_strip[11];
out_tile[31] += filts_strip[7]*in_strip[11];
out_tile[32] += filts_strip[0]*in_strip[13];
out_tile[33] += filts_strip[1]*in_strip[13];
out_tile[34] += filts_strip[2]*in_strip[13];
out_tile[35] += filts_strip[3]*in_strip[13];
out_tile[36] += filts_strip[4]*in_strip[13];
out_tile[37] += filts_strip[5]*in_strip[13];
out_tile[38] += filts_strip[6]*in_strip[13];
out_tile[39] += filts_strip[7]*in_strip[13];
out_tile[40] += filts_strip[0]*in_strip[15];
out_tile[41] += filts_strip[1]*in_strip[15];
out_tile[42] += filts_strip[2]*in_strip[15];
out_tile[43] += filts_strip[3]*in_strip[15];
out_tile[44] += filts_strip[4]*in_strip[15];
out_tile[45] += filts_strip[5]*in_strip[15];
out_tile[46] += filts_strip[6]*in_strip[15];
out_tile[47] += filts_strip[7]*in_strip[15];
out_tile[48] += filts_strip[0]*in_strip[17];
out_tile[49] += filts_strip[1]*in_strip[17];
out_tile[50] += filts_strip[2]*in_strip[17];
out_tile[51] += filts_strip[3]*in_strip[17];
out_tile[52] += filts_strip[4]*in_strip[17];
out_tile[53] += filts_strip[5]*in_strip[17];
out_tile[54] += filts_strip[6]*in_strip[17];
out_tile[55] += filts_strip[7]*in_strip[17];
out_tile[56] += filts_strip[0]*in_strip[19];
out_tile[57] += filts_strip[1]*in_strip[19];
out_tile[58] += filts_strip[2]*in_strip[19];
out_tile[59] += filts_strip[3]*in_strip[19];
out_tile[60] += filts_strip[4]*in_strip[19];
out_tile[61] += filts_strip[5]*in_strip[19];
out_tile[62] += filts_strip[6]*in_strip[19];
out_tile[63] += filts_strip[7]*in_strip[19];
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[6];
out_tile[1] += filts_strip[1]*in_strip[6];
out_tile[2] += filts_strip[2]*in_strip[6];
out_tile[3] += filts_strip[3]*in_strip[6];
out_tile[4] += filts_strip[4]*in_strip[6];
out_tile[5] += filts_strip[5]*in_strip[6];
out_tile[6] += filts_strip[6]*in_strip[6];
out_tile[7] += filts_strip[7]*in_strip[6];
out_tile[8] += filts_strip[0]*in_strip[8];
out_tile[9] += filts_strip[1]*in_strip[8];
out_tile[10] += filts_strip[2]*in_strip[8];
out_tile[11] += filts_strip[3]*in_strip[8];
out_tile[12] += filts_strip[4]*in_strip[8];
out_tile[13] += filts_strip[5]*in_strip[8];
out_tile[14] += filts_strip[6]*in_strip[8];
out_tile[15] += filts_strip[7]*in_strip[8];
out_tile[16] += filts_strip[0]*in_strip[10];
out_tile[17] += filts_strip[1]*in_strip[10];
out_tile[18] += filts_strip[2]*in_strip[10];
out_tile[19] += filts_strip[3]*in_strip[10];
out_tile[20] += filts_strip[4]*in_strip[10];
out_tile[21] += filts_strip[5]*in_strip[10];
out_tile[22] += filts_strip[6]*in_strip[10];
out_tile[23] += filts_strip[7]*in_strip[10];
out_tile[24] += filts_strip[0]*in_strip[12];
out_tile[25] += filts_strip[1]*in_strip[12];
out_tile[26] += filts_strip[2]*in_strip[12];
out_tile[27] += filts_strip[3]*in_strip[12];
out_tile[28] += filts_strip[4]*in_strip[12];
out_tile[29] += filts_strip[5]*in_strip[12];
out_tile[30] += filts_strip[6]*in_strip[12];
out_tile[31] += filts_strip[7]*in_strip[12];
out_tile[32] += filts_strip[0]*in_strip[14];
out_tile[33] += filts_strip[1]*in_strip[14];
out_tile[34] += filts_strip[2]*in_strip[14];
out_tile[35] += filts_strip[3]*in_strip[14];
out_tile[36] += filts_strip[4]*in_strip[14];
out_tile[37] += filts_strip[5]*in_strip[14];
out_tile[38] += filts_strip[6]*in_strip[14];
out_tile[39] += filts_strip[7]*in_strip[14];
out_tile[40] += filts_strip[0]*in_strip[16];
out_tile[41] += filts_strip[1]*in_strip[16];
out_tile[42] += filts_strip[2]*in_strip[16];
out_tile[43] += filts_strip[3]*in_strip[16];
out_tile[44] += filts_strip[4]*in_strip[16];
out_tile[45] += filts_strip[5]*in_strip[16];
out_tile[46] += filts_strip[6]*in_strip[16];
out_tile[47] += filts_strip[7]*in_strip[16];
out_tile[48] += filts_strip[0]*in_strip[18];
out_tile[49] += filts_strip[1]*in_strip[18];
out_tile[50] += filts_strip[2]*in_strip[18];
out_tile[51] += filts_strip[3]*in_strip[18];
out_tile[52] += filts_strip[4]*in_strip[18];
out_tile[53] += filts_strip[5]*in_strip[18];
out_tile[54] += filts_strip[6]*in_strip[18];
out_tile[55] += filts_strip[7]*in_strip[18];
out_tile[56] += filts_strip[0]*in_strip[20];
out_tile[57] += filts_strip[1]*in_strip[20];
out_tile[58] += filts_strip[2]*in_strip[20];
out_tile[59] += filts_strip[3]*in_strip[20];
out_tile[60] += filts_strip[4]*in_strip[20];
out_tile[61] += filts_strip[5]*in_strip[20];
out_tile[62] += filts_strip[6]*in_strip[20];
out_tile[63] += filts_strip[7]*in_strip[20];
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* t_tile_stores = // begin t_tile_stores
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; }
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz);
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz);
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ;
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
// Copies `in` (indexed as img:chan:y:x with sizes 154587:51529:227:1) into a
// blocked layout in `out` (bline:bx:in_chan:y:x with sizes 39690:2646:882:21:1).
// Each work-item writes exactly one element of `out`; elements whose source
// coordinate falls outside the 227x227 image, or past image 19, are written as 0.
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_2__kern_sz_7__in_pad_3__in_chans_3__ysz_227__xsz_227__tix_pels_tile_sz_16__t_tile_sz_8__bix_pels_blk_sz_2145( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 5675670 ) { return; } // 5675670 == 143*39690, the number of output elements

  // split the flat output index into its per-dimension components
  int32_t const blk_x       = out_ix%21;
  int32_t const blk_y       = (out_ix/21)%42;
  int32_t const blk_in_chan = (out_ix/882)%3;
  int32_t const blk_bx      = (out_ix/2646)%15;
  int32_t const blk_bline   = out_ix/39690;

  // each bline covers 16 output lines; output lines are numbered img*114 + y
  int32_t const first_out_line = blk_bline*16;
  // padded input lines spanned by one image: (out lines - 1)*stride + kern_sz
  int32_t const lines_per_img = (114 - 1)*2 + 7;
  // input line relative to the top of the image that first_out_line belongs to
  int32_t const rel_in_line = blk_y + (first_out_line%114)*2;
  // rel_in_line may run past the end of that image; fold the overflow into img
  int32_t const img   = (first_out_line/114) + rel_in_line/lines_per_img;
  // the trailing -3 removes the padding offset (in_pad = 3)
  int32_t const src_y = (rel_in_line % lines_per_img) - 3;
  int32_t const src_x = blk_bx*8*2 + blk_x - 3;

  int32_t const in_range = ( src_x >= 0 ) && ( src_x < 227 )
                        && ( src_y >= 0 ) && ( src_y < 227 )
                        && ( img < 20 );
  // the ternary only evaluates the load when the coordinate is valid
  out[out_ix] = in_range
    ? in[ img*154587 + blk_in_chan*51529 + src_y*227 + src_x*1 ]
    : 0.0f;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* stride = 2 */
/* kern_sz = 7 */
/* in_pad = 3 */
/* in_chans = 3 */
/* ysz = 227 */
/* xsz = 227 */
/* tix_pels_tile_sz = 16 */
/* t_tile_sz = 8 */
/* bix_pels_blk_sz = 2145 */
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_2__kern_sz_7__in_pad_3__in_chans_3__ysz_227__xsz_227__tix_pels_tile_sz_16__t_tile_sz_8__bix_pels_blk_sz_2145 */
/* out_ix_blk_x_dim = 21 */
/* out_ix_blk_x_sz = 1 */
/* out_ix_blk_x_nomod = out_ix */
/* out_ix_blk_x = (out_ix%%21) */
/* out_ix_blk_y_dim = 42 */
/* out_ix_blk_y_sz = 21 */
/* out_ix_blk_y_nomod = (out_ix/21) */
/* out_ix_blk_y = ((out_ix/21)%%42) */
/* out_ix_blk_in_chan_dim = 3 */
/* out_ix_blk_in_chan_sz = 882 */
/* out_ix_blk_in_chan_nomod = (out_ix/882) */
/* out_ix_blk_in_chan = ((out_ix/882)%%3) */
/* out_ix_blk_bx_dim = 15 */
/* out_ix_blk_bx_sz = 2646 */
/* out_ix_blk_bx_nomod = (out_ix/2646) */
/* out_ix_blk_bx = ((out_ix/2646)%%15) */
/* out_ix_blk_bline_dim = 143 */
/* out_ix_blk_bline_sz = 39690 */
/* out_ix_blk_bline_nomod = (out_ix/39690) */
/* out_ix_blk_bline = (out_ix/39690) */
/* out_ix_sz = 5675670 */
/* out_line_y_dim = 114 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%114) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 114 */
/* out_line_img_nomod = (out_line/114) */
/* out_line_img = (out_line/114) */
/* out_line_sz = 2280 */
/* in_ix_x_dim = 227 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%227) */
/* in_ix_y_dim = 227 */
/* in_ix_y_sz = 227 */
/* in_ix_y_nomod = (in_ix/227) */
/* in_ix_y = ((in_ix/227)%%227) */
/* in_ix_chan_dim = 3 */
/* in_ix_chan_sz = 51529 */
/* in_ix_chan_nomod = (in_ix/51529) */
/* in_ix_chan = ((in_ix/51529)%%3) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 154587 */
/* in_ix_img_nomod = (in_ix/154587) */
/* in_ix_img = (in_ix/154587) */
/* in_ix_sz = 3091740 */
// Permutes the 9408 filter weights from out_chan:in_chan:y:x order (`in`) into
// out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile order (`out`).
// Each work-item moves exactly one weight.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_64__in_chans_3__kysz_7__kxsz_7( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
					  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix >= 9408 ) { return; } // 9408 == 64*3*7*7 weights in total

  // components of the source index (sizes 147:49:7:1)
  int32_t const src_x       = filts_ix%7;
  int32_t const src_y       = (filts_ix/7)%7;
  int32_t const src_in_chan = (filts_ix/49)%3;
  int32_t const fioc        = filts_ix/147; // flat output-channel number

  // output channel split as blk:tile:reg (sizes 64:8:1)
  int32_t const oc_reg  = fioc%8;
  int32_t const oc_tile = (fioc/8)%8;
  int32_t const oc_blk  = fioc/64;

  // destination strides: blk 9408, in_chan 3136, y 448, x 64, reg 8, tile 1
  int32_t const filts_xp_ix =
    oc_blk*9408 +
    src_in_chan*3136 +
    src_y*448 +
    src_x*64 +
    oc_reg*8 +
    oc_tile*1;

  out[filts_xp_ix] = in[filts_ix];
}
// -- template substitution table used: --
/* out_chans = 64 */
/* in_chans = 3 */
/* kysz = 7 */
/* kxsz = 7 */
/* rtc_func_name = xpose_filts__out_chans_64__in_chans_3__kysz_7__kxsz_7 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 7 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%7) */
/* filts_ix_y_dim = 7 */
/* filts_ix_y_sz = 7 */
/* filts_ix_y_nomod = (filts_ix/7) */
/* filts_ix_y = ((filts_ix/7)%%7) */
/* filts_ix_in_chan_dim = 3 */
/* filts_ix_in_chan_sz = 49 */
/* filts_ix_in_chan_nomod = (filts_ix/49) */
/* filts_ix_in_chan = ((filts_ix/49)%%3) */
/* filts_ix_out_chan_dim = 64 */
/* filts_ix_out_chan_sz = 147 */
/* filts_ix_out_chan_nomod = (filts_ix/147) */
/* filts_ix_out_chan = (filts_ix/147) */
/* filts_ix_sz = 9408 */
/* filts_xp_ix_out_chan_tile_dim = 8 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 8 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */
/* filts_xp_ix_x_dim = 7 */
/* filts_xp_ix_x_sz = 64 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_x = ((filts_xp_ix/64)%%7) */
/* filts_xp_ix_y_dim = 7 */
/* filts_xp_ix_y_sz = 448 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/448) */
/* filts_xp_ix_y = ((filts_xp_ix/448)%%7) */
/* filts_xp_ix_in_chan_dim = 3 */
/* filts_xp_ix_in_chan_sz = 3136 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/3136) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/3136)%%3) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 9408 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/9408) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/9408) */
/* filts_xp_ix_sz = 9408 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 8 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%8) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 64 */
/* fioc_out_chan_blk_nomod = (fioc/64) */
/* fioc_out_chan_blk = (fioc/64) */
/* fioc_sz = 64 */
// 3x3, stride-2 max pooling from a 20x64x114x114 input to a 20x64x57x57 output
// (both indexed as img:chan:y:x). Each work-item produces one output element.
//
// The running maximum is seeded with -FLT_MAX (a macro predefined by OpenCL C)
// and only in-bounds taps are folded in. Seeding with 0 and treating
// out-of-range taps as 0 would clamp any window whose real inputs are all
// negative to 0 instead of returning its true maximum.
CUCL_GLOBAL_KERNEL void pool__num_imgs_20__in_pad_0__in_dim_0_114__in_dim_1_114__conv_has_relu_0__kern_sz_3__stride_2__out_chans_64__avg_pool_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 4158720 ) { return; } // 4158720 == 20*64*57*57 output elements
  // With in_pad == 0 the (kx,ky) == (0,0) tap is always in bounds (largest
  // window origin is 56*2 == 112 < 114), so out_v is always overwritten by a
  // real input value before it is stored.
  float out_v = -FLT_MAX;
  for( int32_t kx = 0; kx != 3; ++kx ) {
    for( int32_t ky = 0; ky != 3; ++ky ) {
      int const in_ix_y = ((out_ix/57)%57)*2 + ky - 0;
      int const in_ix_x = (out_ix%57)*2 + kx - 0;
      // the last output row/column has a window that extends one pel past the
      // input edge; those taps are skipped
      if(in_ix_y >= 0 && in_ix_x >= 0 && in_ix_x < 114 && in_ix_y < 114 ) {
        int32_t const in_ix = (out_ix/207936)*831744 + ((out_ix/3249)%64)*12996 +
          in_ix_y*114 + in_ix_x*1;
        out_v = max( out_v, in[in_ix] );
      }
    }
  }
  out[out_ix] = out_v;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_pad = 0 */
/* in_dim_0 = 114 */
/* in_dim_1 = 114 */
/* conv_has_relu = 0 */
/* kern_sz = 3 */
/* stride = 2 */
/* out_chans = 64 */
/* avg_pool = 0 */
/* rtc_func_name = pool__num_imgs_20__in_pad_0__in_dim_0_114__in_dim_1_114__conv_has_relu_0__kern_sz_3__stride_2__out_chans_64__avg_pool_0 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 57 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%57) */
/* out_ix_y_dim = 57 */
/* out_ix_y_sz = 57 */
/* out_ix_y_nomod = (out_ix/57) */
/* out_ix_y = ((out_ix/57)%%57) */
/* out_ix_chan_dim = 64 */
/* out_ix_chan_sz = 3249 */
/* out_ix_chan_nomod = (out_ix/3249) */
/* out_ix_chan = ((out_ix/3249)%%64) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 207936 */
/* out_ix_img_nomod = (out_ix/207936) */
/* out_ix_img = (out_ix/207936) */
/* out_ix_sz = 4158720 */
/* in_ix_x_dim = 114 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%114) */
/* in_ix_y_dim = 114 */
/* in_ix_y_sz = 114 */
/* in_ix_y_nomod = (in_ix/114) */
/* in_ix_y = ((in_ix/114)%%114) */
/* in_ix_chan_dim = 64 */
/* in_ix_chan_sz = 12996 */
/* in_ix_chan_nomod = (in_ix/12996) */
/* in_ix_chan = ((in_ix/12996)%%64) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 831744 */
/* in_ix_img_nomod = (in_ix/831744) */
/* in_ix_img = (in_ix/831744) */
/* in_ix_sz = 16634880 */
/* op = out_v = max( out_v, v ) */
/* op_post = */
// each thread: computes outputs across chan dim, using inputs across chan dim
//
// Across-channel local response normalization over a 20x64x57x57 buffer
// (img:chan:y:x). One work-item handles one (img, y, x) position and walks the
// 64 channels, writing
//   out[c] = in[c] * (1 + 0.0001 * sum(in[c-2..c+2]^2) / 5) ^ -0.75
// with channels outside [0,64) contributing 0 to the sum.
//
// All constants are single-precision literals so the scale is computed
// entirely in float; unsuffixed literals would promote the expression to
// double, which OpenCL devices are not required to support.
CUCL_GLOBAL_KERNEL void lrn__num_imgs_20__chans_64__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1( GASQ float const * const in, GASQ float * const out ) {
  int32_t const tix = GLOB_ID_1D;
  if( tix >= 64980 ) { return; } // 64980 == 20*57*57 (img, y, x) positions
  // iterate over chans
  float ls_buf[5] = {0.0f}; // ring buffer holding the 5 most recently loaded channel values
  int32_t const hls = 5 >> 1; // half window: channels on each side of the center
  // flat index of channel 0 at this thread's (img, y, x)
  int32_t const out_base_ix = (tix/3249)*207936 + ((tix/57)%57)*57 + (tix%57)*1;
  // run hls extra iterations so the last channels see a zero-filled trailing window
  for( int32_t in_chan_ix = 0; in_chan_ix < 64 + hls; ++in_chan_ix ) {
    int32_t const in_off = in_chan_ix*3249;
    int32_t const lsb_ix = in_chan_ix % 5;
    ls_buf[lsb_ix] = (in_chan_ix < 64) ? in[out_base_ix + in_off] : 0.0f;
    if( in_chan_ix >= hls ) {
      // the window centered on out_chan_ix is now fully resident in ls_buf
      int32_t const out_chan_ix = in_chan_ix - hls;
      float ls_sum = 0.0f;
      for( int32_t i = 0; i != 5; ++i ) { ls_sum += ls_buf[i]*ls_buf[i]; }
      float const scale = powf( (1.0f + 0.0001f*ls_sum/5.0f), -0.75f );
      // (lsb_ix+5-hls) % 5 is the ring slot holding the center channel's value
      out[out_base_ix + out_chan_ix*3249] = ls_buf[(lsb_ix+5-hls) % 5] * scale;
    }
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* chans = 64 */
/* ysz = 57 */
/* xsz = 57 */
/* local_size = 5 */
/* alpha = 0.0001 */
/* beta = 0.75 */
/* k = 1 */
/* rtc_func_name = lrn__num_imgs_20__chans_64__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1 */
/* tix_x_dim = 57 */
/* tix_x_sz = 1 */
/* tix_x_nomod = tix */
/* tix_x = (tix%%57) */
/* tix_y_dim = 57 */
/* tix_y_sz = 57 */
/* tix_y_nomod = (tix/57) */
/* tix_y = ((tix/57)%%57) */
/* tix_img_dim = 20 */
/* tix_img_sz = 3249 */
/* tix_img_nomod = (tix/3249) */
/* tix_img = (tix/3249) */
/* tix_sz = 64980 */
/* out_ix_x_dim = 57 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%57) */
/* out_ix_y_dim = 57 */
/* out_ix_y_sz = 57 */
/* out_ix_y_nomod = (out_ix/57) */
/* out_ix_y = ((out_ix/57)%%57) */
/* out_ix_chan_dim = 64 */
/* out_ix_chan_sz = 3249 */
/* out_ix_chan_nomod = (out_ix/3249) */
/* out_ix_chan = ((out_ix/3249)%%64) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 207936 */
/* out_ix_img_nomod = (out_ix/207936) */
/* out_ix_img = (out_ix/207936) */
/* out_ix_sz = 4158720 */
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_57__in_dim_1_57__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_64( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
//int32_t const blk_in_ix_sz = 16*8;
LOCSHAR_MEM float all_smem[1536]; // note: max(filts+in,out) == max(512+1024,1024)
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 512;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*4096; // index of first out chan
int32_t blk_in_ix_base = GRP_ID_1D*8192 + LOC_ID_1D;// index of first input pel to load for this thread
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%8);
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/8);
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D;
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
// iteratate over filter elements
for( int32_t blk_iter = 0; blk_iter != 8; ++blk_iter ) {
BARRIER_SYNC;
// begin smem_loads
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)];
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ];
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ];
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ];
in_smem[(LOC_ID_1D + 128 * 4)] = in[ blk_in_ix_base + (128*4) ];
in_smem[(LOC_ID_1D + 128 * 5)] = in[ blk_in_ix_base + (128*5) ];
in_smem[(LOC_ID_1D + 128 * 6)] = in[ blk_in_ix_base + (128*6) ];
in_smem[(LOC_ID_1D + 128 * 7)] = in[ blk_in_ix_base + (128*7) ];
// end smem_loads;
BARRIER_SYNC;
filts_off += 64*8;
blk_in_ix_base += 1024;
// begin inner_loop_body
filts_strip[0] = filts_smem_off[0*64+0*8];
filts_strip[1] = filts_smem_off[0*64+1*8];
filts_strip[2] = filts_smem_off[0*64+2*8];
filts_strip[3] = filts_smem_off[0*64+3*8];
filts_strip[4] = filts_smem_off[0*64+4*8];
filts_strip[5] = filts_smem_off[0*64+5*8];
filts_strip[6] = filts_smem_off[0*64+6*8];
filts_strip[7] = filts_smem_off[0*64+7*8];
in_strip[0] = in_smem_off[(0*8*16+0)];
in_strip[1] = in_smem_off[(0*8*16+1)];
in_strip[2] = in_smem_off[(0*8*16+2)];
in_strip[3] = in_smem_off[(0*8*16+3)];
in_strip[4] = in_smem_off[(0*8*16+4)];
in_strip[5] = in_smem_off[(0*8*16+5)];
in_strip[6] = in_smem_off[(0*8*16+6)];
in_strip[7] = in_smem_off[(0*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*64+0*8];
filts_strip[1] = filts_smem_off[1*64+1*8];
filts_strip[2] = filts_smem_off[1*64+2*8];
filts_strip[3] = filts_smem_off[1*64+3*8];
filts_strip[4] = filts_smem_off[1*64+4*8];
filts_strip[5] = filts_smem_off[1*64+5*8];
filts_strip[6] = filts_smem_off[1*64+6*8];
filts_strip[7] = filts_smem_off[1*64+7*8];
in_strip[0] = in_smem_off[(1*8*16+0)];
in_strip[1] = in_smem_off[(1*8*16+1)];
in_strip[2] = in_smem_off[(1*8*16+2)];
in_strip[3] = in_smem_off[(1*8*16+3)];
in_strip[4] = in_smem_off[(1*8*16+4)];
in_strip[5] = in_smem_off[(1*8*16+5)];
in_strip[6] = in_smem_off[(1*8*16+6)];
in_strip[7] = in_smem_off[(1*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*64+0*8];
filts_strip[1] = filts_smem_off[2*64+1*8];
filts_strip[2] = filts_smem_off[2*64+2*8];
filts_strip[3] = filts_smem_off[2*64+3*8];
filts_strip[4] = filts_smem_off[2*64+4*8];
filts_strip[5] = filts_smem_off[2*64+5*8];
filts_strip[6] = filts_smem_off[2*64+6*8];
filts_strip[7] = filts_smem_off[2*64+7*8];
in_strip[0] = in_smem_off[(2*8*16+0)];
in_strip[1] = in_smem_off[(2*8*16+1)];
in_strip[2] = in_smem_off[(2*8*16+2)];
in_strip[3] = in_smem_off[(2*8*16+3)];
in_strip[4] = in_smem_off[(2*8*16+4)];
in_strip[5] = in_smem_off[(2*8*16+5)];
in_strip[6] = in_smem_off[(2*8*16+6)];
in_strip[7] = in_smem_off[(2*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*64+0*8];
filts_strip[1] = filts_smem_off[3*64+1*8];
filts_strip[2] = filts_smem_off[3*64+2*8];
filts_strip[3] = filts_smem_off[3*64+3*8];
filts_strip[4] = filts_smem_off[3*64+4*8];
filts_strip[5] = filts_smem_off[3*64+5*8];
filts_strip[6] = filts_smem_off[3*64+6*8];
filts_strip[7] = filts_smem_off[3*64+7*8];
in_strip[0] = in_smem_off[(3*8*16+0)];
in_strip[1] = in_smem_off[(3*8*16+1)];
in_strip[2] = in_smem_off[(3*8*16+2)];
in_strip[3] = in_smem_off[(3*8*16+3)];
in_strip[4] = in_smem_off[(3*8*16+4)];
in_strip[5] = in_smem_off[(3*8*16+5)];
in_strip[6] = in_smem_off[(3*8*16+6)];
in_strip[7] = in_smem_off[(3*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*64+0*8];
filts_strip[1] = filts_smem_off[4*64+1*8];
filts_strip[2] = filts_smem_off[4*64+2*8];
filts_strip[3] = filts_smem_off[4*64+3*8];
filts_strip[4] = filts_smem_off[4*64+4*8];
filts_strip[5] = filts_smem_off[4*64+5*8];
filts_strip[6] = filts_smem_off[4*64+6*8];
filts_strip[7] = filts_smem_off[4*64+7*8];
in_strip[0] = in_smem_off[(4*8*16+0)];
in_strip[1] = in_smem_off[(4*8*16+1)];
in_strip[2] = in_smem_off[(4*8*16+2)];
in_strip[3] = in_smem_off[(4*8*16+3)];
in_strip[4] = in_smem_off[(4*8*16+4)];
in_strip[5] = in_smem_off[(4*8*16+5)];
in_strip[6] = in_smem_off[(4*8*16+6)];
in_strip[7] = in_smem_off[(4*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*64+0*8];
filts_strip[1] = filts_smem_off[5*64+1*8];
filts_strip[2] = filts_smem_off[5*64+2*8];
filts_strip[3] = filts_smem_off[5*64+3*8];
filts_strip[4] = filts_smem_off[5*64+4*8];
filts_strip[5] = filts_smem_off[5*64+5*8];
filts_strip[6] = filts_smem_off[5*64+6*8];
filts_strip[7] = filts_smem_off[5*64+7*8];
in_strip[0] = in_smem_off[(5*8*16+0)];
in_strip[1] = in_smem_off[(5*8*16+1)];
in_strip[2] = in_smem_off[(5*8*16+2)];
in_strip[3] = in_smem_off[(5*8*16+3)];
in_strip[4] = in_smem_off[(5*8*16+4)];
in_strip[5] = in_smem_off[(5*8*16+5)];
in_strip[6] = in_smem_off[(5*8*16+6)];
in_strip[7] = in_smem_off[(5*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*64+0*8];
filts_strip[1] = filts_smem_off[6*64+1*8];
filts_strip[2] = filts_smem_off[6*64+2*8];
filts_strip[3] = filts_smem_off[6*64+3*8];
filts_strip[4] = filts_smem_off[6*64+4*8];
filts_strip[5] = filts_smem_off[6*64+5*8];
filts_strip[6] = filts_smem_off[6*64+6*8];
filts_strip[7] = filts_smem_off[6*64+7*8];
in_strip[0] = in_smem_off[(6*8*16+0)];
in_strip[1] = in_smem_off[(6*8*16+1)];
in_strip[2] = in_smem_off[(6*8*16+2)];
in_strip[3] = in_smem_off[(6*8*16+3)];
in_strip[4] = in_smem_off[(6*8*16+4)];
in_strip[5] = in_smem_off[(6*8*16+5)];
in_strip[6] = in_smem_off[(6*8*16+6)];
in_strip[7] = in_smem_off[(6*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*64+0*8];
filts_strip[1] = filts_smem_off[7*64+1*8];
filts_strip[2] = filts_smem_off[7*64+2*8];
filts_strip[3] = filts_smem_off[7*64+3*8];
filts_strip[4] = filts_smem_off[7*64+4*8];
filts_strip[5] = filts_smem_off[7*64+5*8];
filts_strip[6] = filts_smem_off[7*64+6*8];
filts_strip[7] = filts_smem_off[7*64+7*8];
in_strip[0] = in_smem_off[(7*8*16+0)];
in_strip[1] = in_smem_off[(7*8*16+1)];
in_strip[2] = in_smem_off[(7*8*16+2)];
in_strip[3] = in_smem_off[(7*8*16+3)];
in_strip[4] = in_smem_off[(7*8*16+4)];
in_strip[5] = in_smem_off[(7*8*16+5)];
in_strip[6] = in_smem_off[(7*8*16+6)];
in_strip[7] = in_smem_off[(7*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
;
}
// load per-block biases into smem
if( flags == 2 ) { return; }
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
if( t_smem_bias_ix < 64 ) {
int32_t const ocix_base = (GRP_ID_1D%1)*64;
int32_t const load_reg = t_smem_bias_ix / 8;
int32_t const load_tile = t_smem_bias_ix % 8;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 64 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// load biases into filts_strip
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*8];
filts_strip[1] = filts_smem_off[1*8];
filts_strip[2] = filts_smem_off[2*8];
filts_strip[3] = filts_smem_off[3*8];
filts_strip[4] = filts_smem_off[4*8];
filts_strip[5] = filts_smem_off[5*8];
filts_strip[6] = filts_smem_off[6*8];
filts_strip[7] = filts_smem_off[7*8];
// end t_tile_bias_loads;
if( flags == 1 ) {
GASQ float * const out_off = out + LOC_ID_1D;
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]);
;
return;
}
// add bias to each elem of out_tile[] and store the results to out[]
// begin t_tile_stores
int32_t tpix[8];
int32_t tcix[8];
tpix[0] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)%3249)*1 ; // cache out patch ixs
tpix[1] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)%3249)*1 ; // cache out patch ixs
tpix[2] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)%3249)*1 ; // cache out patch ixs
tpix[3] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)%3249)*1 ; // cache out patch ixs
tpix[4] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)%3249)*1 ; // cache out patch ixs
tpix[5] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)%3249)*1 ; // cache out patch ixs
tpix[6] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)%3249)*1 ; // cache out patch ixs
tpix[7] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)%3249)*1 ; // cache out patch ixs
tcix[0] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+0)*3249; // cache out chan ixs
tcix[1] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+1)*3249; // cache out chan ixs
tcix[2] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+2)*3249; // cache out chan ixs
tcix[3] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+3)*3249; // cache out chan ixs
tcix[4] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+4)*3249; // cache out chan ixs
tcix[5] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+5)*3249; // cache out chan ixs
tcix[6] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+6)*3249; // cache out chan ixs
tcix[7] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+7)*3249; // cache out chan ixs
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*3249) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (64*3249) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (64*3249) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (64*3249) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (64*3249) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (64*3249) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (64*3249) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (64*3249) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 57 */
/* in_dim_1 = 57 */
/* conv_has_relu = 1 */
/* out_chans = 64 */
/* write_xposed = 0 */
/* in_chans = 64 */
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_57__in_dim_1_57__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_64 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 57 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%57) */
/* out_ix_y_dim = 57 */
/* out_ix_y_sz = 57 */
/* out_ix_y_nomod = (out_ix/57) */
/* out_ix_y = ((out_ix/57)%%57) */
/* out_ix_chan_dim = 64 */
/* out_ix_chan_sz = 3249 */
/* out_ix_chan_nomod = (out_ix/3249) */
/* out_ix_chan = ((out_ix/3249)%%64) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 207936 */
/* out_ix_img_nomod = (out_ix/207936) */
/* out_ix_img = (out_ix/207936) */
/* out_ix_sz = 4158720 */
/* tpb = 128 */
/* in_chan_tile = 8 */
/* LOC_ID_1D_out_chan_tile_dim = 8 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%8) */
/* LOC_ID_1D_pels_tile_dim = 16 */
/* LOC_ID_1D_pels_tile_sz = 8 */
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/8) */
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/8) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_pels_blk_dim = 508 */
/* GRP_ID_1D_pels_blk_sz = 1 */
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_pels_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 508 */
/* in_ix_blk_pel_dim = 128 */
/* in_ix_blk_pel_sz = 1 */
/* in_ix_blk_pel_nomod = in_ix */
/* in_ix_blk_pel = (in_ix%%128) */
/* in_ix_blk_iter_chan_dim = 8 */
/* in_ix_blk_iter_chan_sz = 128 */
/* in_ix_blk_iter_chan_nomod = (in_ix/128) */
/* in_ix_blk_iter_chan = ((in_ix/128)%%8) */
/* in_ix_blk_iter_dim = 8 */
/* in_ix_blk_iter_sz = 1024 */
/* in_ix_blk_iter_nomod = (in_ix/1024) */
/* in_ix_blk_iter = ((in_ix/1024)%%8) */
/* in_ix_blk_dim = 508 */
/* in_ix_blk_sz = 8192 */
/* in_ix_blk_nomod = (in_ix/8192) */
/* in_ix_blk = (in_ix/8192) */
/* in_ix_sz = 4161536 */
/* blk_filt_ix_sz = 64 */
/* filts_smem_sz = 512 */
/* in_smem_sz = 1024 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1536 */
/* filts_xp_ix_out_chan_tile_dim = 8 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 8 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */
/* filts_xp_ix_in_chan_dim = 64 */
/* filts_xp_ix_in_chan_sz = 64 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%64) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 4096 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/4096) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/4096) */
/* filts_xp_ix_sz = 4096 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* smem_loads = // begin smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)];
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ];
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ];
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ];
in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ];
in_smem[(LOC_ID_1D + %(tpb) * 6)] = in[ blk_in_ix_base + (%(tpb)*6) ];
in_smem[(LOC_ID_1D + %(tpb) * 7)] = in[ blk_in_ix_base + (%(tpb)*7) ];
// end smem_loads */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* t_smem_ld_pel_pel_dim = 128 */
/* t_smem_ld_pel_pel_sz = 1 */
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%128) */
/* t_smem_ld_pel_chan_dim = 8 */
/* t_smem_ld_pel_chan_sz = 128 */
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/128) */
/* t_smem_ld_pel_chan = (t_smem_ld_pel/128) */
/* t_smem_ld_pel_sz = 1024 */
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */
/* out_pel_0_pel_dim = 3249 */
/* out_pel_0_pel_sz = 1 */
/* out_pel_0_pel_nomod = %(out_pel_0) */
/* out_pel_0_pel = (%(out_pel_0)%%3249) */
/* out_pel_0_img_dim = 20 */
/* out_pel_0_img_sz = 3249 */
/* out_pel_0_img_nomod = (%(out_pel_0)/3249) */
/* out_pel_0_img = (%(out_pel_0)/3249) */
/* out_pel_0_sz = 64980 */
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */
/* out_pel_1_pel_dim = 3249 */
/* out_pel_1_pel_sz = 1 */
/* out_pel_1_pel_nomod = %(out_pel_1) */
/* out_pel_1_pel = (%(out_pel_1)%%3249) */
/* out_pel_1_img_dim = 20 */
/* out_pel_1_img_sz = 3249 */
/* out_pel_1_img_nomod = (%(out_pel_1)/3249) */
/* out_pel_1_img = (%(out_pel_1)/3249) */
/* out_pel_1_sz = 64980 */
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */
/* out_pel_2_pel_dim = 3249 */
/* out_pel_2_pel_sz = 1 */
/* out_pel_2_pel_nomod = %(out_pel_2) */
/* out_pel_2_pel = (%(out_pel_2)%%3249) */
/* out_pel_2_img_dim = 20 */
/* out_pel_2_img_sz = 3249 */
/* out_pel_2_img_nomod = (%(out_pel_2)/3249) */
/* out_pel_2_img = (%(out_pel_2)/3249) */
/* out_pel_2_sz = 64980 */
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */
/* out_pel_3_pel_dim = 3249 */
/* out_pel_3_pel_sz = 1 */
/* out_pel_3_pel_nomod = %(out_pel_3) */
/* out_pel_3_pel = (%(out_pel_3)%%3249) */
/* out_pel_3_img_dim = 20 */
/* out_pel_3_img_sz = 3249 */
/* out_pel_3_img_nomod = (%(out_pel_3)/3249) */
/* out_pel_3_img = (%(out_pel_3)/3249) */
/* out_pel_3_sz = 64980 */
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */
/* out_pel_4_pel_dim = 3249 */
/* out_pel_4_pel_sz = 1 */
/* out_pel_4_pel_nomod = %(out_pel_4) */
/* out_pel_4_pel = (%(out_pel_4)%%3249) */
/* out_pel_4_img_dim = 20 */
/* out_pel_4_img_sz = 3249 */
/* out_pel_4_img_nomod = (%(out_pel_4)/3249) */
/* out_pel_4_img = (%(out_pel_4)/3249) */
/* out_pel_4_sz = 64980 */
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */
/* out_pel_5_pel_dim = 3249 */
/* out_pel_5_pel_sz = 1 */
/* out_pel_5_pel_nomod = %(out_pel_5) */
/* out_pel_5_pel = (%(out_pel_5)%%3249) */
/* out_pel_5_img_dim = 20 */
/* out_pel_5_img_sz = 3249 */
/* out_pel_5_img_nomod = (%(out_pel_5)/3249) */
/* out_pel_5_img = (%(out_pel_5)/3249) */
/* out_pel_5_sz = 64980 */
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */
/* out_pel_6_pel_dim = 3249 */
/* out_pel_6_pel_sz = 1 */
/* out_pel_6_pel_nomod = %(out_pel_6) */
/* out_pel_6_pel = (%(out_pel_6)%%3249) */
/* out_pel_6_img_dim = 20 */
/* out_pel_6_img_sz = 3249 */
/* out_pel_6_img_nomod = (%(out_pel_6)/3249) */
/* out_pel_6_img = (%(out_pel_6)/3249) */
/* out_pel_6_sz = 64980 */
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */
/* out_pel_7_pel_dim = 3249 */
/* out_pel_7_pel_sz = 1 */
/* out_pel_7_pel_nomod = %(out_pel_7) */
/* out_pel_7_pel = (%(out_pel_7)%%3249) */
/* out_pel_7_img_dim = 20 */
/* out_pel_7_img_sz = 3249 */
/* out_pel_7_img_nomod = (%(out_pel_7)/3249) */
/* out_pel_7_img = (%(out_pel_7)/3249) */
/* out_pel_7_sz = 64980 */
/* t_tile_stores = // begin t_tile_stores
int32_t tpix[%(t_tile_sz)];
int32_t tcix[%(t_tile_sz)];
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]);
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* inner_loop_body = // begin inner_loop_body
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
*/
// Copies `in` into the blocked layout of `out`: each work-item handles exactly one
// element, out[out_ix], where out_ix is this work-item's 1D global id.
// The flat output index is split as (blk, blk_iter, blk_iter_chan, blk_pel) with strides
// (8192, 1024, 128, 1). Elements whose channel or image index falls outside the input
// (chan_ix >= 64 or image >= 20) are written as 0.0f.
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_64__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_16__bix_pels_blk_sz_508( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;

  // coordinates of this element inside the blocked output
  int32_t const blk           = out_ix / 8192;
  int32_t const blk_iter      = (out_ix / 1024) % 8;
  int32_t const blk_iter_chan = (out_ix / 128) % 8;
  int32_t const blk_pel       = out_ix % 128;

  // matching source channel and flat (img, y, x) pel index
  int32_t const chan_ix = blk_iter * 8 + blk_iter_chan;
  int32_t const pel_ix  = blk * 128 + blk_pel;

  // coordinates of the source pel; one image plane is 57*57 = 3249 pels
  int32_t const img = pel_ix / 3249;
  int32_t const y   = (pel_ix / 57) % 57;
  int32_t const x   = pel_ix % 57;

  // `in` is only read when the coordinates are valid; everything else is zero padding
  int32_t const is_valid = ( chan_ix < 64 ) && ( img < 20 );
  out[out_ix] = is_valid ? in[ img*207936 + chan_ix*3249 + y*57 + x ] : 0.0f;
}
/*
in_pels = num_img * in.sz.dims_prod()
num_in_blks = u32_ceil_div( in_pels, block_chan_pels )
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12
pad_chan = block_iter * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512=2K (per-iter cache) is fine.
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?)
*/
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chan_tile = 8 */
/* pad_in_chans = 64 */
/* in_chans = 64 */
/* ysz = 57 */
/* xsz = 57 */
/* tix_pels_tile_sz = 16 */
/* bix_pels_blk_sz = 508 */
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_64__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_16__bix_pels_blk_sz_508 */
/* out_ix_blk_pel_dim = 128 */
/* out_ix_blk_pel_sz = 1 */
/* out_ix_blk_pel_nomod = out_ix */
/* out_ix_blk_pel = (out_ix%%128) */
/* out_ix_blk_iter_chan_dim = 8 */
/* out_ix_blk_iter_chan_sz = 128 */
/* out_ix_blk_iter_chan_nomod = (out_ix/128) */
/* out_ix_blk_iter_chan = ((out_ix/128)%%8) */
/* out_ix_blk_iter_dim = 8 */
/* out_ix_blk_iter_sz = 1024 */
/* out_ix_blk_iter_nomod = (out_ix/1024) */
/* out_ix_blk_iter = ((out_ix/1024)%%8) */
/* out_ix_blk_dim = 508 */
/* out_ix_blk_sz = 8192 */
/* out_ix_blk_nomod = (out_ix/8192) */
/* out_ix_blk = (out_ix/8192) */
/* out_ix_sz = 4161536 */
/* pel_ix_x_dim = 57 */
/* pel_ix_x_sz = 1 */
/* pel_ix_x_nomod = pel_ix */
/* pel_ix_x = (pel_ix%%57) */
/* pel_ix_y_dim = 57 */
/* pel_ix_y_sz = 57 */
/* pel_ix_y_nomod = (pel_ix/57) */
/* pel_ix_y = ((pel_ix/57)%%57) */
/* pel_ix_img_dim = 20 */
/* pel_ix_img_sz = 3249 */
/* pel_ix_img_nomod = (pel_ix/3249) */
/* pel_ix_img = (pel_ix/3249) */
/* pel_ix_sz = 64980 */
/* in_ix_x_dim = 57 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%57) */
/* in_ix_y_dim = 57 */
/* in_ix_y_sz = 57 */
/* in_ix_y_nomod = (in_ix/57) */
/* in_ix_y = ((in_ix/57)%%57) */
/* in_ix_chan_dim = 64 */
/* in_ix_chan_sz = 3249 */
/* in_ix_chan_nomod = (in_ix/3249) */
/* in_ix_chan = ((in_ix/3249)%%64) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 207936 */
/* in_ix_img_nomod = (in_ix/207936) */
/* in_ix_img = (in_ix/207936) */
/* in_ix_sz = 4158720 */
// Reorders the filter weights: each work-item copies one element in[filts_ix] to its
// position out[filts_xp_ix] in the out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile layout.
// Work-items with filts_ix >= 4096 do nothing.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_64__in_chans_64__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix >= 4096 ) { return; }

  // source coordinates: out_chan has stride 64, in_chan stride 1
  int32_t const fioc    = filts_ix / 64;
  int32_t const in_chan = filts_ix % 64;
  // the y and x kernel coordinates are both (filts_ix%1), i.e. always 0 for this 1x1 kernel
  int32_t const kyx     = filts_ix % 1;

  // fioc (the out_chan index) split into blk:tile:reg
  int32_t const oc_blk  = fioc / 64;
  int32_t const oc_tile = (fioc / 8) % 8;
  int32_t const oc_reg  = fioc % 8;

  // destination strides: blk 4096, in_chan/y/x 64, reg 8, tile 1
  int32_t const filts_xp_ix = oc_blk*4096 + in_chan*64 + kyx*64 + kyx*64 + oc_reg*8 + oc_tile;

#if 1
  float const val = in[filts_ix];
#else
  // debug pattern: encode the kernel coordinates for in_chan 0, zero elsewhere
  float val = 0.0f;
  if( in_chan == 0 ) { val = kyx*100 + kyx; }
#endif
  out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 64 */
/* in_chans = 64 */
/* kysz = 1 */
/* kxsz = 1 */
/* rtc_func_name = xpose_filts__out_chans_64__in_chans_64__kysz_1__kxsz_1 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 1 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%1) */
/* filts_ix_y_dim = 1 */
/* filts_ix_y_sz = 1 */
/* filts_ix_y_nomod = filts_ix */
/* filts_ix_y = (filts_ix%%1) */
/* filts_ix_in_chan_dim = 64 */
/* filts_ix_in_chan_sz = 1 */
/* filts_ix_in_chan_nomod = filts_ix */
/* filts_ix_in_chan = (filts_ix%%64) */
/* filts_ix_out_chan_dim = 64 */
/* filts_ix_out_chan_sz = 64 */
/* filts_ix_out_chan_nomod = (filts_ix/64) */
/* filts_ix_out_chan = (filts_ix/64) */
/* filts_ix_sz = 4096 */
/* filts_xp_ix_out_chan_tile_dim = 8 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 8 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 64 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_x = ((filts_xp_ix/64)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 64 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_y = ((filts_xp_ix/64)%%1) */
/* filts_xp_ix_in_chan_dim = 64 */
/* filts_xp_ix_in_chan_sz = 64 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%64) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 4096 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/4096) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/4096) */
/* filts_xp_ix_sz = 4096 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 8 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%8) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 64 */
/* fioc_out_chan_blk_nomod = (fioc/64) */
/* fioc_out_chan_blk = (fioc/64) */
/* fioc_sz = 64 */
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_57__in_dim_1_57__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_64( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(384+120,1024)
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 384;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[10]; // segment of input line sufficient for one unrolling of inner loop
int32_t blk_in_ix_base = (GRP_ID_1D/2)*7680 + LOC_ID_1D;// index of first input pel to load for this thread
int32_t const blk_filt_ix_base = (GRP_ID_1D%2)*73728; // index of first out chan
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16);
int32_t out_line = (GRP_ID_1D/16)*8; // first out_line of block
int32_t const blk_fli = (out_line/57); // image of first out_line of block
out_line += (LOC_ID_1D/16); // adjust to out_line of this thread
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img)
int32_t const img_off_lines = ((out_line/57) - blk_fli)*(3-1);
int32_t const in_y = (out_line%57)*1 - 1;
for( int32_t in_chan = 0; in_chan != 64; ++in_chan ) {
BARRIER_SYNC;
// begin in_smem_loads
if( (LOC_ID_1D + 128 * 0) < 120) { in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];}
blk_in_ix_base += 120;
// end in_smem_loads;
for( int32_t ky = 0; ky != 3; ++ky ) {
if( ky != 0 ) { BARRIER_SYNC; }
// begin filt_smem_loads
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
filts_off += 384;
// end filt_smem_loads;
BARRIER_SYNC;
if( (out_line/57) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid)
if( ((in_y+ky) < 0) || ((in_y+ky)>57) ) { continue; } // optimization: skip known-to-be-padding input lines
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/16)*1+ky+img_off_lines)*10;
// begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
filts_strip[0] = filts_smem_off[0*128+0*16];
filts_strip[1] = filts_smem_off[0*128+1*16];
filts_strip[2] = filts_smem_off[0*128+2*16];
filts_strip[3] = filts_smem_off[0*128+3*16];
filts_strip[4] = filts_smem_off[0*128+4*16];
filts_strip[5] = filts_smem_off[0*128+5*16];
filts_strip[6] = filts_smem_off[0*128+6*16];
filts_strip[7] = filts_smem_off[0*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*128+0*16];
filts_strip[1] = filts_smem_off[1*128+1*16];
filts_strip[2] = filts_smem_off[1*128+2*16];
filts_strip[3] = filts_smem_off[1*128+3*16];
filts_strip[4] = filts_smem_off[1*128+4*16];
filts_strip[5] = filts_smem_off[1*128+5*16];
filts_strip[6] = filts_smem_off[1*128+6*16];
filts_strip[7] = filts_smem_off[1*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
filts_strip[0] = filts_smem_off[2*128+0*16];
filts_strip[1] = filts_smem_off[2*128+1*16];
filts_strip[2] = filts_smem_off[2*128+2*16];
filts_strip[3] = filts_smem_off[2*128+3*16];
filts_strip[4] = filts_smem_off[2*128+4*16];
filts_strip[5] = filts_smem_off[2*128+5*16];
filts_strip[6] = filts_smem_off[2*128+6*16];
filts_strip[7] = filts_smem_off[2*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
;
}
}
if( flags == 2 ) { return; }
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
if( t_smem_bias_ix < 128 ) {
int32_t const ocix_base = (GRP_ID_1D%2)*128;
int32_t const load_reg = t_smem_bias_ix / 16;
int32_t const load_tile = t_smem_bias_ix % 16;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 192 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*16];
filts_strip[1] = filts_smem_off[1*16];
filts_strip[2] = filts_smem_off[2*16];
filts_strip[3] = filts_smem_off[3*16];
filts_strip[4] = filts_smem_off[4*16];
filts_strip[5] = filts_smem_off[5*16];
filts_strip[6] = filts_smem_off[6*16];
filts_strip[7] = filts_smem_off[7*16];
// end t_tile_bias_loads;
if( flags == 1 ) { return; }
// begin t_tile_stores
if( (out_line/57) >= 20 ) { return; }
int32_t out_x = ((GRP_ID_1D/2)%8)*8;
int32_t out_chan = ((GRP_ID_1D%2)*16 + (LOC_ID_1D%16))*8;
GASQ float * out_off = out + (out_line/57)*623808 + out_chan*3249 + (out_line%57)*57 + out_x*1 ;
if( (out_x + 0) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 57 */
/* in_dim_1 = 57 */
/* kern_sz = 3 */
/* stride = 1 */
/* in_pad = 1 */
/* t_tile_sz = 8 */
/* conv_has_relu = 1 */
/* out_chans = 192 */
/* in_chans = 64 */
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_57__in_dim_1_57__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_64 */
/* out_ix_x_dim = 57 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%57) */
/* out_ix_y_dim = 57 */
/* out_ix_y_sz = 57 */
/* out_ix_y_nomod = (out_ix/57) */
/* out_ix_y = ((out_ix/57)%%57) */
/* out_ix_chan_dim = 192 */
/* out_ix_chan_sz = 3249 */
/* out_ix_chan_nomod = (out_ix/3249) */
/* out_ix_chan = ((out_ix/3249)%%192) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 623808 */
/* out_ix_img_nomod = (out_ix/623808) */
/* out_ix_img = (out_ix/623808) */
/* out_ix_sz = 12476160 */
/* tpb = 128 */
/* out_line_y_dim = 57 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%57) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 57 */
/* out_line_img_nomod = (out_line/57) */
/* out_line_img = (out_line/57) */
/* out_line_sz = 1140 */
/* in_ix_blk_x_dim = 10 */
/* in_ix_blk_x_sz = 1 */
/* in_ix_blk_x_nomod = in_ix */
/* in_ix_blk_x = (in_ix%%10) */
/* in_ix_blk_y_dim = 12 */
/* in_ix_blk_y_sz = 10 */
/* in_ix_blk_y_nomod = (in_ix/10) */
/* in_ix_blk_y = ((in_ix/10)%%12) */
/* in_ix_blk_in_chan_dim = 64 */
/* in_ix_blk_in_chan_sz = 120 */
/* in_ix_blk_in_chan_nomod = (in_ix/120) */
/* in_ix_blk_in_chan = ((in_ix/120)%%64) */
/* in_ix_blk_bx_dim = 8 */
/* in_ix_blk_bx_sz = 7680 */
/* in_ix_blk_bx_nomod = (in_ix/7680) */
/* in_ix_blk_bx = ((in_ix/7680)%%8) */
/* in_ix_blk_bline_dim = 143 */
/* in_ix_blk_bline_sz = 61440 */
/* in_ix_blk_bline_nomod = (in_ix/61440) */
/* in_ix_blk_bline = (in_ix/61440) */
/* in_ix_sz = 8785920 */
/* LOC_ID_1D_out_chan_tile_dim = 16 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */
/* LOC_ID_1D_blk_y_dim = 8 */
/* LOC_ID_1D_blk_y_sz = 16 */
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/16) */
/* LOC_ID_1D_blk_y = (LOC_ID_1D/16) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 2 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%2) */
/* GRP_ID_1D_blk_bx_dim = 8 */
/* GRP_ID_1D_blk_bx_sz = 2 */
/* GRP_ID_1D_blk_bx_nomod = (GRP_ID_1D/2) */
/* GRP_ID_1D_blk_bx = ((GRP_ID_1D/2)%%8) */
/* GRP_ID_1D_blk_bline_dim = 143 */
/* GRP_ID_1D_blk_bline_sz = 16 */
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/16) */
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/16) */
/* GRP_ID_1D_sz = 2288 */
/* blk_filt_ix_sz = 128 */
/* filts_smem_sz = 384 */
/* in_smem_sz = 120 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1024 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 3 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */
/* filts_xp_ix_y_dim = 3 */
/* filts_xp_ix_y_sz = 384 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */
/* filts_xp_ix_in_chan_dim = 64 */
/* filts_xp_ix_in_chan_sz = 1152 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%64) */
/* filts_xp_ix_out_chan_blk_dim = 2 */
/* filts_xp_ix_out_chan_blk_sz = 73728 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/73728) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/73728) */
/* filts_xp_ix_sz = 147456 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* filt_smem_loads = // begin filt_smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_off += %(filts_xp_ix_y_sz);
// end filt_smem_loads */
/* in_smem_loads = // begin in_smem_loads
if( (LOC_ID_1D + %(tpb) * 0) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];}
blk_in_ix_base += %(in_ix_blk_in_chan_sz);
// end in_smem_loads */
/* inner_loop_body = // begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* t_tile_stores = // begin t_tile_stores
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; }
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz);
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz);
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ;
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
// One work-item per element of `out` (8785920 elements). The flat output index is
// split into (blk_x, blk_y, chan, blk_bx, blk_bline) with dims (10, 12, 64, 8, 143),
// mapped to a source (img, chan, y, x) coordinate in `in`, and the source value is
// copied. Positions that fall outside the 57x57 image (the 1-pel border) or past
// the last image (img >= 20) are written as 0.0f.
//   in  : read at img*207936 + chan*3249 + y*57 + x
//   out : written at the flat global id
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_1144( GASQ float const * const in, GASQ float * const out ) {
  int32_t const gid = GLOB_ID_1D;
  if( gid >= 8785920 ) { return; }

  // components of the flat output index, fastest-varying first
  int32_t const blk_x = gid % 10;
  int32_t const blk_y = (gid / 10) % 12;
  int32_t const chan = (gid / 120) % 64;
  int32_t const blk_bx = (gid / 7680) % 8;
  int32_t const blk_bline = gid / 61440;

  // first output line covered by this block-line; each block-line spans 8 output lines
  int32_t const first_out_line = blk_bline * 8;

  // input lines per image as seen by this kernel: (57 - 1)*1 + 3
  int32_t const lines_per_img = 59;

  // input line relative to the image that first_out_line belongs to; it may run past
  // that image's lines_per_img lines, in which case it wraps into the following image
  int32_t const line = blk_y + (first_out_line % 57);
  int32_t const img = (first_out_line / 57) + (line / lines_per_img);

  // source coordinates; the -1 accounts for the 1-pel pad, so -1 and 57 are outside
  int32_t const src_y = (line % lines_per_img) - 1;
  int32_t const src_x = blk_bx * 8 + blk_x - 1;

  int32_t const inside = ( src_x >= 0 ) && ( src_x < 57 ) && ( src_y >= 0 ) && ( src_y < 57 ) && ( img < 20 );

  // the load is only evaluated when the source coordinate is valid
  out[gid] = inside ? in[ img*207936 + chan*3249 + src_y*57 + src_x ] : 0.0f;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* stride = 1 */
/* kern_sz = 3 */
/* in_pad = 1 */
/* in_chans = 64 */
/* ysz = 57 */
/* xsz = 57 */
/* tix_pels_tile_sz = 8 */
/* t_tile_sz = 8 */
/* bix_pels_blk_sz = 1144 */
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_1144 */
/* out_ix_blk_x_dim = 10 */
/* out_ix_blk_x_sz = 1 */
/* out_ix_blk_x_nomod = out_ix */
/* out_ix_blk_x = (out_ix%%10) */
/* out_ix_blk_y_dim = 12 */
/* out_ix_blk_y_sz = 10 */
/* out_ix_blk_y_nomod = (out_ix/10) */
/* out_ix_blk_y = ((out_ix/10)%%12) */
/* out_ix_blk_in_chan_dim = 64 */
/* out_ix_blk_in_chan_sz = 120 */
/* out_ix_blk_in_chan_nomod = (out_ix/120) */
/* out_ix_blk_in_chan = ((out_ix/120)%%64) */
/* out_ix_blk_bx_dim = 8 */
/* out_ix_blk_bx_sz = 7680 */
/* out_ix_blk_bx_nomod = (out_ix/7680) */
/* out_ix_blk_bx = ((out_ix/7680)%%8) */
/* out_ix_blk_bline_dim = 143 */
/* out_ix_blk_bline_sz = 61440 */
/* out_ix_blk_bline_nomod = (out_ix/61440) */
/* out_ix_blk_bline = (out_ix/61440) */
/* out_ix_sz = 8785920 */
/* out_line_y_dim = 57 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%57) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 57 */
/* out_line_img_nomod = (out_line/57) */
/* out_line_img = (out_line/57) */
/* out_line_sz = 1140 */
/* in_ix_x_dim = 57 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%57) */
/* in_ix_y_dim = 57 */
/* in_ix_y_sz = 57 */
/* in_ix_y_nomod = (in_ix/57) */
/* in_ix_y = ((in_ix/57)%%57) */
/* in_ix_chan_dim = 64 */
/* in_ix_chan_sz = 3249 */
/* in_ix_chan_nomod = (in_ix/3249) */
/* in_ix_chan = ((in_ix/3249)%%64) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 207936 */
/* in_ix_img_nomod = (in_ix/207936) */
/* in_ix_img = (in_ix/207936) */
/* in_ix_sz = 4158720 */
// One work-item per source filter element (110592 elements). Copies in[i], laid out
// as out_chan:in_chan:y:x, to its position in `out`, laid out as
// out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile. The output channel is split
// as out_chan = out_chan_blk*128 + out_chan_tile*8 + out_chan_reg.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_192__in_chans_64__kysz_3__kxsz_3( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
					  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const src_ix = GLOB_ID_1D;
  if( src_ix >= 110592 ) { return; }

  // components of the flat source index (x fastest)
  int32_t const kx = src_ix % 3;
  int32_t const ky = (src_ix / 3) % 3;
  int32_t const in_chan = (src_ix / 9) % 64;
  int32_t const fioc = src_ix / 576; // source output channel

  // components of the output channel in the destination layout
  int32_t const oc_reg = fioc % 8;
  int32_t const oc_tile = (fioc / 8) % 16;
  int32_t const oc_blk = fioc / 128;

  // destination strides: blk 73728, in_chan 1152, y 384, x 128, reg 16, tile 1
  int32_t const dst_ix = oc_blk*73728 + in_chan*1152 + ky*384 + kx*128 + oc_reg*16 + oc_tile;

  float val = 0.0f;
#if 1
  val = in[src_ix];
#else
  // debug path (compiled out): for in_chan 0 write a value encoding the kernel
  // x/y position instead of the filter data; other elements are written as 0
  if( in_chan == 0 ) {
    val = kx*100 + ky;
  }
#endif
  out[dst_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 192 */
/* in_chans = 64 */
/* kysz = 3 */
/* kxsz = 3 */
/* rtc_func_name = xpose_filts__out_chans_192__in_chans_64__kysz_3__kxsz_3 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 3 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%3) */
/* filts_ix_y_dim = 3 */
/* filts_ix_y_sz = 3 */
/* filts_ix_y_nomod = (filts_ix/3) */
/* filts_ix_y = ((filts_ix/3)%%3) */
/* filts_ix_in_chan_dim = 64 */
/* filts_ix_in_chan_sz = 9 */
/* filts_ix_in_chan_nomod = (filts_ix/9) */
/* filts_ix_in_chan = ((filts_ix/9)%%64) */
/* filts_ix_out_chan_dim = 192 */
/* filts_ix_out_chan_sz = 576 */
/* filts_ix_out_chan_nomod = (filts_ix/576) */
/* filts_ix_out_chan = (filts_ix/576) */
/* filts_ix_sz = 110592 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 3 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */
/* filts_xp_ix_y_dim = 3 */
/* filts_xp_ix_y_sz = 384 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */
/* filts_xp_ix_in_chan_dim = 64 */
/* filts_xp_ix_in_chan_sz = 1152 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%64) */
/* filts_xp_ix_out_chan_blk_dim = 2 */
/* filts_xp_ix_out_chan_blk_sz = 73728 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/73728) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/73728) */
/* filts_xp_ix_sz = 147456 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 16 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%16) */
/* fioc_out_chan_blk_dim = 2 */
/* fioc_out_chan_blk_sz = 128 */
/* fioc_out_chan_blk_nomod = (fioc/128) */
/* fioc_out_chan_blk = (fioc/128) */
/* fioc_sz = 256 */
// Each work-item handles one (img, y, x) position (64980 in total) and walks the
// channel dimension once. For every channel c it writes
//   out[c] = in[c] * (1 + 0.0001 * S / 5)^(-0.75)
// where S is the sum of squares of in[c-2 .. c+2]; channels outside [0,192) count as 0.
// `in` and `out` share the layout img*623808 + chan*3249 + y*57 + x.
CUCL_GLOBAL_KERNEL void lrn__num_imgs_20__chans_192__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1( GASQ float const * const in, GASQ float * const out ) {
  int32_t const pel = GLOB_ID_1D;
  if( pel >= 64980 ) { return; }

  int32_t const half_win = 5 >> 1; // channels on each side of the window center

  // offset of channel 0 for this (img, y, x) position
  int32_t const base = (pel/3249)*623808 + ((pel/57)%57)*57 + (pel%57);

  // circular buffer of the 5 most recently loaded channel values; the zero start
  // stands in for the channels before channel 0
  float win[5] = {0.0f};

  // run half_win steps past the last channel so the trailing outputs get written
  for( int32_t c = 0; c < 192 + half_win; ++c ) {
    int32_t const slot = c % 5;
    if( c < 192 ) { win[slot] = in[base + c*3249]; }
    else { win[slot] = 0.0f; } // channels past the end count as zero

    // until half_win channels are loaded there is no output channel to emit
    if( c < half_win ) { continue; }

    float sq_sum = 0.0f;
    for( int32_t i = 0; i < 5; ++i ) { sq_sum += win[i]*win[i]; }
    float const scale = powf( (1 + 0.0001*sq_sum/5), -0.75 );

    // the window center is the value loaded half_win iterations ago
    int32_t const center_slot = (slot + 5 - half_win) % 5;
    out[base + (c - half_win)*3249] = win[center_slot] * scale;
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* chans = 192 */
/* ysz = 57 */
/* xsz = 57 */
/* local_size = 5 */
/* alpha = 0.0001 */
/* beta = 0.75 */
/* k = 1 */
/* rtc_func_name = lrn__num_imgs_20__chans_192__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1 */
/* tix_x_dim = 57 */
/* tix_x_sz = 1 */
/* tix_x_nomod = tix */
/* tix_x = (tix%%57) */
/* tix_y_dim = 57 */
/* tix_y_sz = 57 */
/* tix_y_nomod = (tix/57) */
/* tix_y = ((tix/57)%%57) */
/* tix_img_dim = 20 */
/* tix_img_sz = 3249 */
/* tix_img_nomod = (tix/3249) */
/* tix_img = (tix/3249) */
/* tix_sz = 64980 */
/* out_ix_x_dim = 57 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%57) */
/* out_ix_y_dim = 57 */
/* out_ix_y_sz = 57 */
/* out_ix_y_nomod = (out_ix/57) */
/* out_ix_y = ((out_ix/57)%%57) */
/* out_ix_chan_dim = 192 */
/* out_ix_chan_sz = 3249 */
/* out_ix_chan_nomod = (out_ix/3249) */
/* out_ix_chan = ((out_ix/3249)%%192) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 623808 */
/* out_ix_img_nomod = (out_ix/623808) */
/* out_ix_img = (out_ix/623808) */
/* out_ix_sz = 12476160 */
// Max pooling: one work-item per output element (3010560 in total). Each output is
// the maximum over a 3x3 window, stride 2, of the matching channel plane of `in`.
//   in  : read at img*623808 + chan*3249 + y*57 + x
//   out : written at the flat global id, which decomposes as
//         img*150528 + chan*784 + y*28 + x
//
// The running maximum is seeded from the first in-bounds window element rather than
// from 0.0f. Seeding with 0.0f, and folding out-of-bounds elements in as 0, clamps the
// result of an all-negative window to zero, which is not a max. For non-negative
// input the result is unchanged. A window with no in-bounds element still yields 0.0f.
CUCL_GLOBAL_KERNEL void pool__num_imgs_20__in_pad_0__in_dim_0_57__in_dim_1_57__conv_has_relu_0__kern_sz_3__stride_2__out_chans_192__avg_pool_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 3010560 ) { return; }

  // top-left corner of this output's window in the input plane (stride 2, pad 0)
  int32_t const in_y0 = ((out_ix/28)%28)*2 - 0;
  int32_t const in_x0 = (out_ix%28)*2 - 0;
  // offset of the (img, chan) plane this output reads from
  int32_t const in_plane = (out_ix/150528)*623808 + ((out_ix/784)%192)*3249;

  float out_v = 0.0f;  // value written if the window has no in-bounds element
  int32_t have_v = 0;  // becomes 1 once out_v holds a real input value
  for( int32_t kx = 0; kx != 3; ++kx ) {
    for( int32_t ky = 0; ky != 3; ++ky ) {
      int32_t const in_ix_y = in_y0 + ky;
      int32_t const in_ix_x = in_x0 + kx;
      // With these constants the largest coordinate is 27*2 + 2 = 56, so the check
      // never fails; it is kept so the kernel stays safe if the constants change.
      if( in_ix_y >= 0 && in_ix_x >= 0 && in_ix_x < 57 && in_ix_y < 57 ) {
        float const v = in[in_plane + in_ix_y*57 + in_ix_x*1];
        out_v = have_v ? max( out_v, v ) : v;
        have_v = 1;
      }
    }
  }
  out[out_ix] = out_v;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_pad = 0 */
/* in_dim_0 = 57 */
/* in_dim_1 = 57 */
/* conv_has_relu = 0 */
/* kern_sz = 3 */
/* stride = 2 */
/* out_chans = 192 */
/* avg_pool = 0 */
/* rtc_func_name = pool__num_imgs_20__in_pad_0__in_dim_0_57__in_dim_1_57__conv_has_relu_0__kern_sz_3__stride_2__out_chans_192__avg_pool_0 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 192 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%192) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 150528 */
/* out_ix_img_nomod = (out_ix/150528) */
/* out_ix_img = (out_ix/150528) */
/* out_ix_sz = 3010560 */
/* in_ix_x_dim = 57 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%57) */
/* in_ix_y_dim = 57 */
/* in_ix_y_sz = 57 */
/* in_ix_y_nomod = (in_ix/57) */
/* in_ix_y = ((in_ix/57)%%57) */
/* in_ix_chan_dim = 192 */
/* in_ix_chan_sz = 3249 */
/* in_ix_chan_nomod = (in_ix/3249) */
/* in_ix_chan = ((in_ix/3249)%%192) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 623808 */
/* in_ix_img_nomod = (in_ix/623808) */
/* in_ix_img = (in_ix/623808) */
/* in_ix_sz = 12476160 */
/* op = out_v = max( out_v, v ) */
/* op_post = */
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_96__write_xposed_0__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
//int32_t const blk_in_ix_sz = 10*8;
LOCSHAR_MEM float all_smem[1408]; // note: max(filts+in,out) == max(768+640,960)
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 768;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*18432; // index of first out chan
int32_t blk_in_ix_base = GRP_ID_1D*15360 + LOC_ID_1D;// index of first input pel to load for this thread
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%12);
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/12);
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D;
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
// iteratate over filter elements
for( int32_t blk_iter = 0; blk_iter != 24; ++blk_iter ) {
BARRIER_SYNC;
// begin smem_loads
filts_smem[(LOC_ID_1D + 120 * 0)] = filts[filts_off+(120*0)];
filts_smem[(LOC_ID_1D + 120 * 1)] = filts[filts_off+(120*1)];
filts_smem[(LOC_ID_1D + 120 * 2)] = filts[filts_off+(120*2)];
filts_smem[(LOC_ID_1D + 120 * 3)] = filts[filts_off+(120*3)];
filts_smem[(LOC_ID_1D + 120 * 4)] = filts[filts_off+(120*4)];
filts_smem[(LOC_ID_1D + 120 * 5)] = filts[filts_off+(120*5)];
if( (LOC_ID_1D + 120 * 6) < 768 ) { filts_smem[(LOC_ID_1D + 120 * 6)] = filts[filts_off+(120*6)];}
in_smem[(LOC_ID_1D + 120 * 0)] = in[ blk_in_ix_base + (120*0) ];
in_smem[(LOC_ID_1D + 120 * 1)] = in[ blk_in_ix_base + (120*1) ];
in_smem[(LOC_ID_1D + 120 * 2)] = in[ blk_in_ix_base + (120*2) ];
in_smem[(LOC_ID_1D + 120 * 3)] = in[ blk_in_ix_base + (120*3) ];
in_smem[(LOC_ID_1D + 120 * 4)] = in[ blk_in_ix_base + (120*4) ];
if( (LOC_ID_1D + 120 * 5) < 640) { in_smem[(LOC_ID_1D + 120 * 5)] = in[ blk_in_ix_base + (120*5) ];}
// end smem_loads;
BARRIER_SYNC;
filts_off += 96*8;
blk_in_ix_base += 640;
// begin inner_loop_body
filts_strip[0] = filts_smem_off[0*96+0*12];
filts_strip[1] = filts_smem_off[0*96+1*12];
filts_strip[2] = filts_smem_off[0*96+2*12];
filts_strip[3] = filts_smem_off[0*96+3*12];
filts_strip[4] = filts_smem_off[0*96+4*12];
filts_strip[5] = filts_smem_off[0*96+5*12];
filts_strip[6] = filts_smem_off[0*96+6*12];
filts_strip[7] = filts_smem_off[0*96+7*12];
in_strip[0] = in_smem_off[(0*8*10+0)];
in_strip[1] = in_smem_off[(0*8*10+1)];
in_strip[2] = in_smem_off[(0*8*10+2)];
in_strip[3] = in_smem_off[(0*8*10+3)];
in_strip[4] = in_smem_off[(0*8*10+4)];
in_strip[5] = in_smem_off[(0*8*10+5)];
in_strip[6] = in_smem_off[(0*8*10+6)];
in_strip[7] = in_smem_off[(0*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*96+0*12];
filts_strip[1] = filts_smem_off[1*96+1*12];
filts_strip[2] = filts_smem_off[1*96+2*12];
filts_strip[3] = filts_smem_off[1*96+3*12];
filts_strip[4] = filts_smem_off[1*96+4*12];
filts_strip[5] = filts_smem_off[1*96+5*12];
filts_strip[6] = filts_smem_off[1*96+6*12];
filts_strip[7] = filts_smem_off[1*96+7*12];
in_strip[0] = in_smem_off[(1*8*10+0)];
in_strip[1] = in_smem_off[(1*8*10+1)];
in_strip[2] = in_smem_off[(1*8*10+2)];
in_strip[3] = in_smem_off[(1*8*10+3)];
in_strip[4] = in_smem_off[(1*8*10+4)];
in_strip[5] = in_smem_off[(1*8*10+5)];
in_strip[6] = in_smem_off[(1*8*10+6)];
in_strip[7] = in_smem_off[(1*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*96+0*12];
filts_strip[1] = filts_smem_off[2*96+1*12];
filts_strip[2] = filts_smem_off[2*96+2*12];
filts_strip[3] = filts_smem_off[2*96+3*12];
filts_strip[4] = filts_smem_off[2*96+4*12];
filts_strip[5] = filts_smem_off[2*96+5*12];
filts_strip[6] = filts_smem_off[2*96+6*12];
filts_strip[7] = filts_smem_off[2*96+7*12];
in_strip[0] = in_smem_off[(2*8*10+0)];
in_strip[1] = in_smem_off[(2*8*10+1)];
in_strip[2] = in_smem_off[(2*8*10+2)];
in_strip[3] = in_smem_off[(2*8*10+3)];
in_strip[4] = in_smem_off[(2*8*10+4)];
in_strip[5] = in_smem_off[(2*8*10+5)];
in_strip[6] = in_smem_off[(2*8*10+6)];
in_strip[7] = in_smem_off[(2*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*96+0*12];
filts_strip[1] = filts_smem_off[3*96+1*12];
filts_strip[2] = filts_smem_off[3*96+2*12];
filts_strip[3] = filts_smem_off[3*96+3*12];
filts_strip[4] = filts_smem_off[3*96+4*12];
filts_strip[5] = filts_smem_off[3*96+5*12];
filts_strip[6] = filts_smem_off[3*96+6*12];
filts_strip[7] = filts_smem_off[3*96+7*12];
in_strip[0] = in_smem_off[(3*8*10+0)];
in_strip[1] = in_smem_off[(3*8*10+1)];
in_strip[2] = in_smem_off[(3*8*10+2)];
in_strip[3] = in_smem_off[(3*8*10+3)];
in_strip[4] = in_smem_off[(3*8*10+4)];
in_strip[5] = in_smem_off[(3*8*10+5)];
in_strip[6] = in_smem_off[(3*8*10+6)];
in_strip[7] = in_smem_off[(3*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*96+0*12];
filts_strip[1] = filts_smem_off[4*96+1*12];
filts_strip[2] = filts_smem_off[4*96+2*12];
filts_strip[3] = filts_smem_off[4*96+3*12];
filts_strip[4] = filts_smem_off[4*96+4*12];
filts_strip[5] = filts_smem_off[4*96+5*12];
filts_strip[6] = filts_smem_off[4*96+6*12];
filts_strip[7] = filts_smem_off[4*96+7*12];
in_strip[0] = in_smem_off[(4*8*10+0)];
in_strip[1] = in_smem_off[(4*8*10+1)];
in_strip[2] = in_smem_off[(4*8*10+2)];
in_strip[3] = in_smem_off[(4*8*10+3)];
in_strip[4] = in_smem_off[(4*8*10+4)];
in_strip[5] = in_smem_off[(4*8*10+5)];
in_strip[6] = in_smem_off[(4*8*10+6)];
in_strip[7] = in_smem_off[(4*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*96+0*12];
filts_strip[1] = filts_smem_off[5*96+1*12];
filts_strip[2] = filts_smem_off[5*96+2*12];
filts_strip[3] = filts_smem_off[5*96+3*12];
filts_strip[4] = filts_smem_off[5*96+4*12];
filts_strip[5] = filts_smem_off[5*96+5*12];
filts_strip[6] = filts_smem_off[5*96+6*12];
filts_strip[7] = filts_smem_off[5*96+7*12];
in_strip[0] = in_smem_off[(5*8*10+0)];
in_strip[1] = in_smem_off[(5*8*10+1)];
in_strip[2] = in_smem_off[(5*8*10+2)];
in_strip[3] = in_smem_off[(5*8*10+3)];
in_strip[4] = in_smem_off[(5*8*10+4)];
in_strip[5] = in_smem_off[(5*8*10+5)];
in_strip[6] = in_smem_off[(5*8*10+6)];
in_strip[7] = in_smem_off[(5*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*96+0*12];
filts_strip[1] = filts_smem_off[6*96+1*12];
filts_strip[2] = filts_smem_off[6*96+2*12];
filts_strip[3] = filts_smem_off[6*96+3*12];
filts_strip[4] = filts_smem_off[6*96+4*12];
filts_strip[5] = filts_smem_off[6*96+5*12];
filts_strip[6] = filts_smem_off[6*96+6*12];
filts_strip[7] = filts_smem_off[6*96+7*12];
in_strip[0] = in_smem_off[(6*8*10+0)];
in_strip[1] = in_smem_off[(6*8*10+1)];
in_strip[2] = in_smem_off[(6*8*10+2)];
in_strip[3] = in_smem_off[(6*8*10+3)];
in_strip[4] = in_smem_off[(6*8*10+4)];
in_strip[5] = in_smem_off[(6*8*10+5)];
in_strip[6] = in_smem_off[(6*8*10+6)];
in_strip[7] = in_smem_off[(6*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*96+0*12];
filts_strip[1] = filts_smem_off[7*96+1*12];
filts_strip[2] = filts_smem_off[7*96+2*12];
filts_strip[3] = filts_smem_off[7*96+3*12];
filts_strip[4] = filts_smem_off[7*96+4*12];
filts_strip[5] = filts_smem_off[7*96+5*12];
filts_strip[6] = filts_smem_off[7*96+6*12];
filts_strip[7] = filts_smem_off[7*96+7*12];
in_strip[0] = in_smem_off[(7*8*10+0)];
in_strip[1] = in_smem_off[(7*8*10+1)];
in_strip[2] = in_smem_off[(7*8*10+2)];
in_strip[3] = in_smem_off[(7*8*10+3)];
in_strip[4] = in_smem_off[(7*8*10+4)];
in_strip[5] = in_smem_off[(7*8*10+5)];
in_strip[6] = in_smem_off[(7*8*10+6)];
in_strip[7] = in_smem_off[(7*8*10+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
;
}
// load per-block biases into smem
if( flags == 2 ) { return; }
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+120*i;
if( t_smem_bias_ix < 96 ) {
int32_t const ocix_base = (GRP_ID_1D%1)*96;
int32_t const load_reg = t_smem_bias_ix / 12;
int32_t const load_tile = t_smem_bias_ix % 12;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 96 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// load biases into filts_strip
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*12];
filts_strip[1] = filts_smem_off[1*12];
filts_strip[2] = filts_smem_off[2*12];
filts_strip[3] = filts_smem_off[3*12];
filts_strip[4] = filts_smem_off[4*12];
filts_strip[5] = filts_smem_off[5*12];
filts_strip[6] = filts_smem_off[6*12];
filts_strip[7] = filts_smem_off[7*12];
// end t_tile_bias_loads;
if( flags == 1 ) {
GASQ float * const out_off = out + LOC_ID_1D;
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[120] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[240] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[360] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[480] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[600] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[720] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[840] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[960] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1080] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1200] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1320] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1440] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1560] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1680] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1800] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[1920] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2040] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2160] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2280] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2400] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2520] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2640] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2760] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[2880] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3000] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3120] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3240] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3360] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3480] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3600] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3720] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[3840] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[3960] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4080] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4200] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4320] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4440] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4560] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4680] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[4800] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[4920] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5040] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5160] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5280] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5400] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5520] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[5640] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[5760] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[5880] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6000] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6120] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6240] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6360] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6480] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[6600] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[6720] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[6840] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[6960] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7080] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7200] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7320] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7440] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[7560] = max(0.0f,out_tile[63]+filts_strip[7]);
;
return;
}
// add bias to each elem of out_tile[] and store the results to out[]
// begin t_tile_stores
int32_t tpix[8];
int32_t tcix[8];
tpix[0] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+0)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+0)%784)*1 ; // cache out patch ixs
tpix[1] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+1)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+1)%784)*1 ; // cache out patch ixs
tpix[2] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+2)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+2)%784)*1 ; // cache out patch ixs
tpix[3] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+3)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+3)%784)*1 ; // cache out patch ixs
tpix[4] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+4)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+4)%784)*1 ; // cache out patch ixs
tpix[5] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+5)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+5)%784)*1 ; // cache out patch ixs
tpix[6] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+6)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+6)%784)*1 ; // cache out patch ixs
tpix[7] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+7)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+7)%784)*1 ; // cache out patch ixs
tcix[0] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+0)*784; // cache out chan ixs
tcix[1] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+1)*784; // cache out chan ixs
tcix[2] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+2)*784; // cache out chan ixs
tcix[3] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+3)*784; // cache out chan ixs
tcix[4] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+4)*784; // cache out chan ixs
tcix[5] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+5)*784; // cache out chan ixs
tcix[6] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+6)*784; // cache out chan ixs
tcix[7] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+7)*784; // cache out chan ixs
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+0)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+1)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+2)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+3)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+4)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+5)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+6)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+7)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (96*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (96*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (96*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (96*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (96*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (96*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (96*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (96*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 1 */
/* out_chans = 96 */
/* write_xposed = 0 */
/* in_chans = 192 */
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_96__write_xposed_0__in_chans_192 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 96 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%96) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 75264 */
/* out_ix_img_nomod = (out_ix/75264) */
/* out_ix_img = (out_ix/75264) */
/* out_ix_sz = 1505280 */
/* tpb = 120 */
/* in_chan_tile = 8 */
/* LOC_ID_1D_out_chan_tile_dim = 12 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%12) */
/* LOC_ID_1D_pels_tile_dim = 10 */
/* LOC_ID_1D_pels_tile_sz = 12 */
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/12) */
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/12) */
/* LOC_ID_1D_sz = 120 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_pels_blk_dim = 196 */
/* GRP_ID_1D_pels_blk_sz = 1 */
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_pels_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 196 */
/* in_ix_blk_pel_dim = 80 */
/* in_ix_blk_pel_sz = 1 */
/* in_ix_blk_pel_nomod = in_ix */
/* in_ix_blk_pel = (in_ix%%80) */
/* in_ix_blk_iter_chan_dim = 8 */
/* in_ix_blk_iter_chan_sz = 80 */
/* in_ix_blk_iter_chan_nomod = (in_ix/80) */
/* in_ix_blk_iter_chan = ((in_ix/80)%%8) */
/* in_ix_blk_iter_dim = 24 */
/* in_ix_blk_iter_sz = 640 */
/* in_ix_blk_iter_nomod = (in_ix/640) */
/* in_ix_blk_iter = ((in_ix/640)%%24) */
/* in_ix_blk_dim = 196 */
/* in_ix_blk_sz = 15360 */
/* in_ix_blk_nomod = (in_ix/15360) */
/* in_ix_blk = (in_ix/15360) */
/* in_ix_sz = 3010560 */
/* blk_filt_ix_sz = 96 */
/* filts_smem_sz = 768 */
/* in_smem_sz = 640 */
/* out_smem_sz = 960 */
/* all_smem_sz = 1408 */
/* filts_xp_ix_out_chan_tile_dim = 12 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%12) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 12 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/12) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/12)%%8) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 96 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/96) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/96)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 18432 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/18432) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/18432) */
/* filts_xp_ix_sz = 18432 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* smem_loads = // begin smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)];
filts_smem[(LOC_ID_1D + %(tpb) * 4)] = filts[filts_off+(%(tpb)*4)];
filts_smem[(LOC_ID_1D + %(tpb) * 5)] = filts[filts_off+(%(tpb)*5)];
if( (LOC_ID_1D + %(tpb) * 6) < %(filts_smem_sz) ) { filts_smem[(LOC_ID_1D + %(tpb) * 6)] = filts[filts_off+(%(tpb)*6)];}
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ];
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ];
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ];
if( (LOC_ID_1D + %(tpb) * 5) < %(in_ix_blk_iter_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ];}
// end smem_loads */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* t_smem_ld_pel_pel_dim = 80 */
/* t_smem_ld_pel_pel_sz = 1 */
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%80) */
/* t_smem_ld_pel_chan_dim = 8 */
/* t_smem_ld_pel_chan_sz = 80 */
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/80) */
/* t_smem_ld_pel_chan = (t_smem_ld_pel/80) */
/* t_smem_ld_pel_sz = 640 */
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */
/* out_pel_0_pel_dim = 784 */
/* out_pel_0_pel_sz = 1 */
/* out_pel_0_pel_nomod = %(out_pel_0) */
/* out_pel_0_pel = (%(out_pel_0)%%784) */
/* out_pel_0_img_dim = 20 */
/* out_pel_0_img_sz = 784 */
/* out_pel_0_img_nomod = (%(out_pel_0)/784) */
/* out_pel_0_img = (%(out_pel_0)/784) */
/* out_pel_0_sz = 15680 */
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */
/* out_pel_1_pel_dim = 784 */
/* out_pel_1_pel_sz = 1 */
/* out_pel_1_pel_nomod = %(out_pel_1) */
/* out_pel_1_pel = (%(out_pel_1)%%784) */
/* out_pel_1_img_dim = 20 */
/* out_pel_1_img_sz = 784 */
/* out_pel_1_img_nomod = (%(out_pel_1)/784) */
/* out_pel_1_img = (%(out_pel_1)/784) */
/* out_pel_1_sz = 15680 */
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */
/* out_pel_2_pel_dim = 784 */
/* out_pel_2_pel_sz = 1 */
/* out_pel_2_pel_nomod = %(out_pel_2) */
/* out_pel_2_pel = (%(out_pel_2)%%784) */
/* out_pel_2_img_dim = 20 */
/* out_pel_2_img_sz = 784 */
/* out_pel_2_img_nomod = (%(out_pel_2)/784) */
/* out_pel_2_img = (%(out_pel_2)/784) */
/* out_pel_2_sz = 15680 */
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */
/* out_pel_3_pel_dim = 784 */
/* out_pel_3_pel_sz = 1 */
/* out_pel_3_pel_nomod = %(out_pel_3) */
/* out_pel_3_pel = (%(out_pel_3)%%784) */
/* out_pel_3_img_dim = 20 */
/* out_pel_3_img_sz = 784 */
/* out_pel_3_img_nomod = (%(out_pel_3)/784) */
/* out_pel_3_img = (%(out_pel_3)/784) */
/* out_pel_3_sz = 15680 */
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */
/* out_pel_4_pel_dim = 784 */
/* out_pel_4_pel_sz = 1 */
/* out_pel_4_pel_nomod = %(out_pel_4) */
/* out_pel_4_pel = (%(out_pel_4)%%784) */
/* out_pel_4_img_dim = 20 */
/* out_pel_4_img_sz = 784 */
/* out_pel_4_img_nomod = (%(out_pel_4)/784) */
/* out_pel_4_img = (%(out_pel_4)/784) */
/* out_pel_4_sz = 15680 */
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */
/* out_pel_5_pel_dim = 784 */
/* out_pel_5_pel_sz = 1 */
/* out_pel_5_pel_nomod = %(out_pel_5) */
/* out_pel_5_pel = (%(out_pel_5)%%784) */
/* out_pel_5_img_dim = 20 */
/* out_pel_5_img_sz = 784 */
/* out_pel_5_img_nomod = (%(out_pel_5)/784) */
/* out_pel_5_img = (%(out_pel_5)/784) */
/* out_pel_5_sz = 15680 */
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */
/* out_pel_6_pel_dim = 784 */
/* out_pel_6_pel_sz = 1 */
/* out_pel_6_pel_nomod = %(out_pel_6) */
/* out_pel_6_pel = (%(out_pel_6)%%784) */
/* out_pel_6_img_dim = 20 */
/* out_pel_6_img_sz = 784 */
/* out_pel_6_img_nomod = (%(out_pel_6)/784) */
/* out_pel_6_img = (%(out_pel_6)/784) */
/* out_pel_6_sz = 15680 */
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */
/* out_pel_7_pel_dim = 784 */
/* out_pel_7_pel_sz = 1 */
/* out_pel_7_pel_nomod = %(out_pel_7) */
/* out_pel_7_pel = (%(out_pel_7)%%784) */
/* out_pel_7_img_dim = 20 */
/* out_pel_7_img_sz = 784 */
/* out_pel_7_img_nomod = (%(out_pel_7)/784) */
/* out_pel_7_img = (%(out_pel_7)/784) */
/* out_pel_7_sz = 15680 */
/* t_tile_stores = // begin t_tile_stores
int32_t tpix[%(t_tile_sz)];
int32_t tcix[%(t_tile_sz)];
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[120] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[240] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[360] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[480] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[600] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[720] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[840] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[960] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1080] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1200] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1320] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1440] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1560] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1680] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1800] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[1920] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2040] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2160] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2280] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2400] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2520] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2640] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2760] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[2880] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3000] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3120] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3240] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3360] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3480] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3600] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3720] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[3840] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[3960] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4080] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4200] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4320] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4440] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4560] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4680] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[4800] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[4920] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5040] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5160] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5280] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5400] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5520] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[5640] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[5760] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[5880] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6000] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6120] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6240] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6360] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6480] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[6600] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[6720] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[6840] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[6960] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7080] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7200] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7320] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7440] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[7560] = max(0.0f,out_tile[63]+filts_strip[7]);
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* inner_loop_body = // begin inner_loop_body
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
*/
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_10__bix_pels_blk_sz_196( GASQ float const * const in, GASQ float * const out ) {
  // Re-lays-out `in` (indexed img:chan:y:x with strides 150528:784:28:1) into `out`.
  // Each work-item writes exactly one element of `out`, selected by its 1D global id.
  //
  // out_ix splits, outermost to innermost, into:
  //   blk           = out_ix/15360
  //   blk_iter      = (out_ix/640)%24
  //   blk_iter_chan = (out_ix/80)%8
  //   blk_pel       = out_ix%80
  int32_t const out_ix = GLOB_ID_1D;
  int32_t const blk = out_ix / 15360;
  int32_t const blk_iter = (out_ix / 640) % 24;
  int32_t const blk_iter_chan = (out_ix / 80) % 8;
  int32_t const blk_pel = out_ix % 80;

  // source channel: 8 channels per blk_iter
  int32_t const chan_ix = blk_iter*8 + blk_iter_chan;
  // source pel, linear over (img, y, x): 80 pels per blk
  int32_t const pel_ix = blk*80 + blk_pel;
  int32_t const img = pel_ix / 784;

  // positions with no corresponding source element are filled with zero (and `in` is not read)
  if( ( chan_ix >= 192 ) || ( img >= 20 ) ) {
    out[out_ix] = 0.0f;
    return;
  }

  int32_t const y = (pel_ix / 28) % 28;
  int32_t const x = pel_ix % 28;
  out[out_ix] = in[ img*150528 + chan_ix*784 + y*28 + x*1 ];
}
/*
in_pels = num_img * in.sz.dims_prod()
num_in_blks = u32_ceil_div( in_pels, block_chan_pels )
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12
pad_chan = block_iters * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512=2K (per-iter cache) is fine.
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?)
*/
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chan_tile = 8 */
/* pad_in_chans = 192 */
/* in_chans = 192 */
/* ysz = 28 */
/* xsz = 28 */
/* tix_pels_tile_sz = 10 */
/* bix_pels_blk_sz = 196 */
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_10__bix_pels_blk_sz_196 */
/* out_ix_blk_pel_dim = 80 */
/* out_ix_blk_pel_sz = 1 */
/* out_ix_blk_pel_nomod = out_ix */
/* out_ix_blk_pel = (out_ix%%80) */
/* out_ix_blk_iter_chan_dim = 8 */
/* out_ix_blk_iter_chan_sz = 80 */
/* out_ix_blk_iter_chan_nomod = (out_ix/80) */
/* out_ix_blk_iter_chan = ((out_ix/80)%%8) */
/* out_ix_blk_iter_dim = 24 */
/* out_ix_blk_iter_sz = 640 */
/* out_ix_blk_iter_nomod = (out_ix/640) */
/* out_ix_blk_iter = ((out_ix/640)%%24) */
/* out_ix_blk_dim = 196 */
/* out_ix_blk_sz = 15360 */
/* out_ix_blk_nomod = (out_ix/15360) */
/* out_ix_blk = (out_ix/15360) */
/* out_ix_sz = 3010560 */
/* pel_ix_x_dim = 28 */
/* pel_ix_x_sz = 1 */
/* pel_ix_x_nomod = pel_ix */
/* pel_ix_x = (pel_ix%%28) */
/* pel_ix_y_dim = 28 */
/* pel_ix_y_sz = 28 */
/* pel_ix_y_nomod = (pel_ix/28) */
/* pel_ix_y = ((pel_ix/28)%%28) */
/* pel_ix_img_dim = 20 */
/* pel_ix_img_sz = 784 */
/* pel_ix_img_nomod = (pel_ix/784) */
/* pel_ix_img = (pel_ix/784) */
/* pel_ix_sz = 15680 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 192 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%192) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 150528 */
/* in_ix_img_nomod = (in_ix/150528) */
/* in_ix_img = (in_ix/150528) */
/* in_ix_sz = 3010560 */
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_96__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
					  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  //
  // Copies each of the 18432 elements of `in` to a permuted position in `out`.
  // One work-item handles one source element; work-items whose id is past the
  // end of the filter array do nothing.
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix < 18432 ) {
    // decompose the source index (out_chan:in_chan:y:x, strides 192:1:1:1)
    int32_t const fioc = filts_ix / 192; // source out_chan
    int32_t const in_chan = filts_ix % 192;
    int32_t const ky = filts_ix % 1; // always 0 for this 1x1 kernel
    int32_t const kx = filts_ix % 1; // always 0 for this 1x1 kernel

    // split out_chan as blk:tile:reg (fioc/96 : (fioc/8)%12 : fioc%8)
    int32_t const oc_blk = fioc / 96;
    int32_t const oc_tile = (fioc / 8) % 12;
    int32_t const oc_reg = fioc % 8;

    // destination index: out_chan_tile is innermost (stride 1), then out_chan_reg (stride 12)
    int32_t const filts_xp_ix =
      oc_blk*18432 +
      oc_reg*12 +
      oc_tile*1 +
      in_chan*96 +
      ky*96 +
      kx*96;

#if 1
    float const elem = in[filts_ix];
#else
    // debug pattern (disabled): position-derived value for in_chan 0, zero elsewhere
    float const elem = ( in_chan == 0 ) ? (float)( ky*100 + kx ) : 0.0f;
#endif
    out[filts_xp_ix] = elem;
  }
}
// -- template substitution table used: --
/* out_chans = 96 */
/* in_chans = 192 */
/* kysz = 1 */
/* kxsz = 1 */
/* rtc_func_name = xpose_filts__out_chans_96__in_chans_192__kysz_1__kxsz_1 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 1 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%1) */
/* filts_ix_y_dim = 1 */
/* filts_ix_y_sz = 1 */
/* filts_ix_y_nomod = filts_ix */
/* filts_ix_y = (filts_ix%%1) */
/* filts_ix_in_chan_dim = 192 */
/* filts_ix_in_chan_sz = 1 */
/* filts_ix_in_chan_nomod = filts_ix */
/* filts_ix_in_chan = (filts_ix%%192) */
/* filts_ix_out_chan_dim = 96 */
/* filts_ix_out_chan_sz = 192 */
/* filts_ix_out_chan_nomod = (filts_ix/192) */
/* filts_ix_out_chan = (filts_ix/192) */
/* filts_ix_sz = 18432 */
/* filts_xp_ix_out_chan_tile_dim = 12 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%12) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 12 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/12) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/12)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 96 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/96) */
/* filts_xp_ix_x = ((filts_xp_ix/96)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 96 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/96) */
/* filts_xp_ix_y = ((filts_xp_ix/96)%%1) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 96 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/96) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/96)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 18432 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/18432) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/18432) */
/* filts_xp_ix_sz = 18432 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 12 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%12) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 96 */
/* fioc_out_chan_blk_nomod = (fioc/96) */
/* fioc_out_chan_blk = (fioc/96) */
/* fioc_sz = 96 */
// each thread: computes 8x8 block of out
// loop over k dim
// Tiled convolution kernel (OpenCL), specialized by the generator for the configuration encoded in
// its name: 20 images, 28x28 input, 3x3 kernel, stride 1, padding 1, 96 input channels, 128 output
// channels, with ReLU fused into the store (max(0.0f, ...)).
//
// Work decomposition (as used by the index math below):
//  - the code assumes 128 work-items per work-group (all cooperative loads use a stride of 128);
//  - GRP_ID_1D%4 selects an 8-pel-wide column of the output, GRP_ID_1D/4 selects a block of 8 output
//    lines (lines are numbered across all images, 28 per image);
//  - LOC_ID_1D%16 selects a group of 8 consecutive output channels, LOC_ID_1D/16 selects the line
//    within the block;
//  - each work-item accumulates an 8 (x) by 8 (output channel) tile in out_tile, indexed as x*8 + chan.
//
// Parameters:
//  filts  - filter weights; consumed in order, 384 floats (3 kx * 128 out chans) per (in_chan, ky) step.
//  biases - one bias per output channel (128 values are read).
//  in     - input pels; each work-group reads 120 floats (12 lines * 10 pels) per input channel,
//           starting at GRP_ID_1D*11520.
//           NOTE(review): presumably pre-tiled (with zero padding) by a separate transform -- confirm against the producer.
//  out    - output, written at img*100352 + chan*784 + y*28 + x.
//  flags  - 2: return right after the accumulation loops; 1: return after the bias loads; in both
//           cases nothing is stored. NOTE(review): looks like a profiling/debugging hook -- confirm with the caller.
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_128__in_chans_96( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(384+120,1024)
// local memory layout: [0,384) holds filter values, [384,504) holds input pels
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 384;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[10]; // segment of input line sufficient for one unrolling of inner loop
int32_t blk_in_ix_base = GRP_ID_1D*11520 + LOC_ID_1D;// index of first input pel to load for this thread
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*110592; // index of first out chan
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16);
int32_t out_line = (GRP_ID_1D/4)*8; // first out_line of block
int32_t const blk_fli = (out_line/28); // image of first out_line of block
out_line += (LOC_ID_1D/16); // adjust to out_line of this thread
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img)
int32_t const img_off_lines = ((out_line/28) - blk_fli)*(3-1);
// first input line touched by this thread's output line (stride 1, padding 1); ranges over -1..26
int32_t const in_y = (out_line%28)*1 - 1;
for( int32_t in_chan = 0; in_chan != 96; ++in_chan ) {
// all work-items must be done reading local memory from the previous iteration before it is overwritten
BARRIER_SYNC;
// begin in_smem_loads
if( (LOC_ID_1D + 128 * 0) < 120) { in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];}
blk_in_ix_base += 120;
// end in_smem_loads;
for( int32_t ky = 0; ky != 3; ++ky ) {
// for ky == 0 the barrier at the top of the in_chan loop already separates reads from these writes
if( ky != 0 ) { BARRIER_SYNC; }
// begin filt_smem_loads
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
filts_off += 384;
// end filt_smem_loads;
// make the freshly written in_smem / filts_smem contents visible to the whole work-group
BARRIER_SYNC;
if( (out_line/28) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid)
// valid input lines are 0..27; in_y+ky ranges over -1..28, so line 28 (bottom padding) must be
// tested with >= (a '> 28' test can never be true and would never skip the bottom padding line).
// Skipping relies on padding lines contributing nothing to the sum, as the '< 0' case already does.
if( ((in_y+ky) < 0) || ((in_y+ky) >= 28) ) { continue; } // optimization: skip known-to-be-padding input lines
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/16)*1+ky+img_off_lines)*10;
// begin inner_loop_body
// load the 10 input pels needed for 8 outputs of a 3-wide kernel row
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
// kx == 0: filter values for this thread's 8 output channels; output x uses in_strip[x+0]
filts_strip[0] = filts_smem_off[0*128+0*16];
filts_strip[1] = filts_smem_off[0*128+1*16];
filts_strip[2] = filts_smem_off[0*128+2*16];
filts_strip[3] = filts_smem_off[0*128+3*16];
filts_strip[4] = filts_smem_off[0*128+4*16];
filts_strip[5] = filts_smem_off[0*128+5*16];
filts_strip[6] = filts_smem_off[0*128+6*16];
filts_strip[7] = filts_smem_off[0*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
// kx == 1: output x uses in_strip[x+1]
filts_strip[0] = filts_smem_off[1*128+0*16];
filts_strip[1] = filts_smem_off[1*128+1*16];
filts_strip[2] = filts_smem_off[1*128+2*16];
filts_strip[3] = filts_smem_off[1*128+3*16];
filts_strip[4] = filts_smem_off[1*128+4*16];
filts_strip[5] = filts_smem_off[1*128+5*16];
filts_strip[6] = filts_smem_off[1*128+6*16];
filts_strip[7] = filts_smem_off[1*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
// kx == 2: output x uses in_strip[x+2]
filts_strip[0] = filts_smem_off[2*128+0*16];
filts_strip[1] = filts_smem_off[2*128+1*16];
filts_strip[2] = filts_smem_off[2*128+2*16];
filts_strip[3] = filts_smem_off[2*128+3*16];
filts_strip[4] = filts_smem_off[2*128+4*16];
filts_strip[5] = filts_smem_off[2*128+5*16];
filts_strip[6] = filts_smem_off[2*128+6*16];
filts_strip[7] = filts_smem_off[2*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
;
}
}
// flags == 2: stop after accumulation, before the bias loads and output stores
if( flags == 2 ) { return; }
// all reads of filter values from filts_smem must finish before it is reused to stage the biases
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
if( t_smem_bias_ix < 128 ) {
int32_t const ocix_base = (GRP_ID_1D%1)*128;
// biases are staged so that slot (reg*16 + tile) holds the bias of output channel (tile*8 + reg),
// matching the filts_smem_off[reg*16] reads below
int32_t const load_reg = t_smem_bias_ix / 16;
int32_t const load_tile = t_smem_bias_ix % 16;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 128 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*16];
filts_strip[1] = filts_smem_off[1*16];
filts_strip[2] = filts_smem_off[2*16];
filts_strip[3] = filts_smem_off[3*16];
filts_strip[4] = filts_smem_off[4*16];
filts_strip[5] = filts_smem_off[5*16];
filts_strip[6] = filts_smem_off[6*16];
filts_strip[7] = filts_smem_off[7*16];
// end t_tile_bias_loads;
// flags == 1: stop after the bias loads, before the output stores
if( flags == 1 ) { return; }
// begin t_tile_stores
// lines belonging to images past the last one (image index >= 20) have nothing to store
if( (out_line/28) >= 20 ) { return; }
int32_t out_x = (GRP_ID_1D%4)*8;
int32_t out_chan = ((GRP_ID_1D%1)*16 + (LOC_ID_1D%16))*8;
// out is indexed as img*100352 + chan*784 + y*28 + x
GASQ float * out_off = out + (out_line/28)*100352 + out_chan*784 + (out_line%28)*28 + out_x*1 ;
// each store below adds the channel's bias and applies ReLU via max(0.0f, ...)
if( (out_x + 0) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* kern_sz = 3 */
/* stride = 1 */
/* in_pad = 1 */
/* t_tile_sz = 8 */
/* conv_has_relu = 1 */
/* out_chans = 128 */
/* in_chans = 96 */
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_128__in_chans_96 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 128 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%128) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 100352 */
/* out_ix_img_nomod = (out_ix/100352) */
/* out_ix_img = (out_ix/100352) */
/* out_ix_sz = 2007040 */
/* tpb = 128 */
/* out_line_y_dim = 28 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%28) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 28 */
/* out_line_img_nomod = (out_line/28) */
/* out_line_img = (out_line/28) */
/* out_line_sz = 560 */
/* in_ix_blk_x_dim = 10 */
/* in_ix_blk_x_sz = 1 */
/* in_ix_blk_x_nomod = in_ix */
/* in_ix_blk_x = (in_ix%%10) */
/* in_ix_blk_y_dim = 12 */
/* in_ix_blk_y_sz = 10 */
/* in_ix_blk_y_nomod = (in_ix/10) */
/* in_ix_blk_y = ((in_ix/10)%%12) */
/* in_ix_blk_in_chan_dim = 96 */
/* in_ix_blk_in_chan_sz = 120 */
/* in_ix_blk_in_chan_nomod = (in_ix/120) */
/* in_ix_blk_in_chan = ((in_ix/120)%%96) */
/* in_ix_blk_bx_dim = 4 */
/* in_ix_blk_bx_sz = 11520 */
/* in_ix_blk_bx_nomod = (in_ix/11520) */
/* in_ix_blk_bx = ((in_ix/11520)%%4) */
/* in_ix_blk_bline_dim = 70 */
/* in_ix_blk_bline_sz = 46080 */
/* in_ix_blk_bline_nomod = (in_ix/46080) */
/* in_ix_blk_bline = (in_ix/46080) */
/* in_ix_sz = 3225600 */
/* LOC_ID_1D_out_chan_tile_dim = 16 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */
/* LOC_ID_1D_blk_y_dim = 8 */
/* LOC_ID_1D_blk_y_sz = 16 */
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/16) */
/* LOC_ID_1D_blk_y = (LOC_ID_1D/16) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_blk_bx_dim = 4 */
/* GRP_ID_1D_blk_bx_sz = 1 */
/* GRP_ID_1D_blk_bx_nomod = GRP_ID_1D */
/* GRP_ID_1D_blk_bx = (GRP_ID_1D%%4) */
/* GRP_ID_1D_blk_bline_dim = 70 */
/* GRP_ID_1D_blk_bline_sz = 4 */
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/4) */
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/4) */
/* GRP_ID_1D_sz = 280 */
/* blk_filt_ix_sz = 128 */
/* filts_smem_sz = 384 */
/* in_smem_sz = 120 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1024 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 3 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */
/* filts_xp_ix_y_dim = 3 */
/* filts_xp_ix_y_sz = 384 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */
/* filts_xp_ix_in_chan_dim = 96 */
/* filts_xp_ix_in_chan_sz = 1152 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%96) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 110592 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/110592) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/110592) */
/* filts_xp_ix_sz = 110592 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* filt_smem_loads = // begin filt_smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_off += %(filts_xp_ix_y_sz);
// end filt_smem_loads */
/* in_smem_loads = // begin in_smem_loads
if( (LOC_ID_1D + %(tpb) * 0) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];}
blk_in_ix_base += %(in_ix_blk_in_chan_sz);
// end in_smem_loads */
/* inner_loop_body = // begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* t_tile_stores = // begin t_tile_stores
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; }
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz);
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz);
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ;
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
// Gather kernel: each work-item writes exactly one element of `out`.
// The flat output index splits (fastest to slowest) into
//   blk_x (10) : blk_y (12) : in_chan (96) : blk_bx (4) : blk_bline (70)
// and is mapped back to an (img, chan, y, x) element of `in`. Elements whose
// source coordinate lies outside the 28x28 image, or past image 19, are
// written as 0.0f.
//   in  : source buffer, indexed as img*75264 + chan*784 + y*28 + x
//   out : destination buffer of 3225600 floats, indexed by the flat work-item id
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_96__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280( GASQ float const * const in, GASQ float * const out ) {
  int32_t const gid = GLOB_ID_1D;
  if( gid >= 3225600 ) { return; } // work-items past the end of out have nothing to do

  // components of the flat output index
  int32_t const blk_x = gid % 10;
  int32_t const blk_y = (gid / 10) % 12;
  int32_t const chan = (gid / 120) % 96;
  int32_t const blk_bx = (gid / 11520) % 4;
  int32_t const first_out_line = (gid / 46080) * 8; // each blk_bline covers 8 output lines

  // Position of the wanted input line, counted from the first line of the image
  // that contains first_out_line. Images are treated as 30 lines tall here
  // ((28 - 1)*1 + 3; presumably (ysz-1)*stride + kern_sz -- confirm against generator),
  // so a block that runs off the bottom of one image continues into the next one.
  int32_t const lines_per_img = (28 - 1)*1 + 3;
  int32_t const line = blk_y + (first_out_line % 28)*1;
  int32_t const img = (first_out_line / 28) + (line / lines_per_img);

  // source pixel coordinate; the -1 shifts into the unpadded image (presumably in_pad)
  int32_t const y = (line % lines_per_img) - 1;
  int32_t const x = blk_bx*8*1 + blk_x - 1;

  // only read `in` when the coordinate is a real pixel of a real image
  int32_t const inside = ( x >= 0 ) && ( y >= 0 ) && ( x < 28 ) && ( y < 28 ) && ( img < 20 );
  out[gid] = inside ? in[ img*75264 + chan*784 + y*28 + x*1 ] : 0.0f;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* stride = 1 */
/* kern_sz = 3 */
/* in_pad = 1 */
/* in_chans = 96 */
/* ysz = 28 */
/* xsz = 28 */
/* tix_pels_tile_sz = 8 */
/* t_tile_sz = 8 */
/* bix_pels_blk_sz = 280 */
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_96__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280 */
/* out_ix_blk_x_dim = 10 */
/* out_ix_blk_x_sz = 1 */
/* out_ix_blk_x_nomod = out_ix */
/* out_ix_blk_x = (out_ix%%10) */
/* out_ix_blk_y_dim = 12 */
/* out_ix_blk_y_sz = 10 */
/* out_ix_blk_y_nomod = (out_ix/10) */
/* out_ix_blk_y = ((out_ix/10)%%12) */
/* out_ix_blk_in_chan_dim = 96 */
/* out_ix_blk_in_chan_sz = 120 */
/* out_ix_blk_in_chan_nomod = (out_ix/120) */
/* out_ix_blk_in_chan = ((out_ix/120)%%96) */
/* out_ix_blk_bx_dim = 4 */
/* out_ix_blk_bx_sz = 11520 */
/* out_ix_blk_bx_nomod = (out_ix/11520) */
/* out_ix_blk_bx = ((out_ix/11520)%%4) */
/* out_ix_blk_bline_dim = 70 */
/* out_ix_blk_bline_sz = 46080 */
/* out_ix_blk_bline_nomod = (out_ix/46080) */
/* out_ix_blk_bline = (out_ix/46080) */
/* out_ix_sz = 3225600 */
/* out_line_y_dim = 28 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%28) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 28 */
/* out_line_img_nomod = (out_line/28) */
/* out_line_img = (out_line/28) */
/* out_line_sz = 560 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 96 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%96) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 75264 */
/* in_ix_img_nomod = (in_ix/75264) */
/* in_ix_img = (in_ix/75264) */
/* in_ix_sz = 1505280 */
// Filter transpose kernel: each work-item copies one filter weight from `in`
// (indexed out_chan:in_chan:y:x, x fastest) to its slot in `out`
// (indexed out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile, tile fastest).
// The source out_chan is split as blk:tile:reg (reg fastest) before being
// scattered into the destination strides.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_128__in_chans_96__kysz_3__kxsz_3( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                 GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const src_ix = GLOB_ID_1D;
  if( src_ix >= 110592 ) { return; } // 128*96*3*3 weights in total

  // components of the source index
  int32_t const kx = src_ix % 3;
  int32_t const ky = (src_ix / 3) % 3;
  int32_t const in_chan = (src_ix / 9) % 96;
  int32_t const out_chan = src_ix / 864;

  // split the output channel into block / tile / register parts
  int32_t const oc_reg = out_chan % 8;
  int32_t const oc_tile = (out_chan / 8) % 16;
  int32_t const oc_blk = out_chan / 128;

  // assemble the destination index, slowest dimension first
  int32_t dst_ix = oc_blk*110592;
  dst_ix += in_chan*1152;
  dst_ix += ky*384;
  dst_ix += kx*128;
  dst_ix += oc_reg*16;
  dst_ix += oc_tile*1;

  float val = 0.0f;
#if 1
  val = in[src_ix];
#else
  // debug pattern: encode the kernel x/y position into the weights of input channel 0
  if( in_chan == 0 ) {
    val = kx*100 + ky;
  }
#endif
  out[dst_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 128 */
/* in_chans = 96 */
/* kysz = 3 */
/* kxsz = 3 */
/* rtc_func_name = xpose_filts__out_chans_128__in_chans_96__kysz_3__kxsz_3 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 3 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%3) */
/* filts_ix_y_dim = 3 */
/* filts_ix_y_sz = 3 */
/* filts_ix_y_nomod = (filts_ix/3) */
/* filts_ix_y = ((filts_ix/3)%%3) */
/* filts_ix_in_chan_dim = 96 */
/* filts_ix_in_chan_sz = 9 */
/* filts_ix_in_chan_nomod = (filts_ix/9) */
/* filts_ix_in_chan = ((filts_ix/9)%%96) */
/* filts_ix_out_chan_dim = 128 */
/* filts_ix_out_chan_sz = 864 */
/* filts_ix_out_chan_nomod = (filts_ix/864) */
/* filts_ix_out_chan = (filts_ix/864) */
/* filts_ix_sz = 110592 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 3 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */
/* filts_xp_ix_y_dim = 3 */
/* filts_xp_ix_y_sz = 384 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */
/* filts_xp_ix_in_chan_dim = 96 */
/* filts_xp_ix_in_chan_sz = 1152 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%96) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 110592 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/110592) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/110592) */
/* filts_xp_ix_sz = 110592 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 16 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%16) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 128 */
/* fioc_out_chan_blk_nomod = (fioc/128) */
/* fioc_out_chan_blk = (fioc/128) */
/* fioc_sz = 128 */
// 256 tbp
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_16__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out ) {
LOCSHAR_MEM float in_smem[64*8];
int32_t const blk_filt_ix_sz = 2*8;
LOCSHAR_MEM float filts_smem[2*8]; // aka blk_filt_ix_sz, which wasn't const enough OpenCL
float out_tile[8*8] = {0}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[8]; // across patches (approx square block in x/y space, favoring x if sqrt() not integer)
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*3072;
int32_t const blk_patch_ix_sz = 64*8;
int32_t const blk_patch_ix_base = GRP_ID_1D*blk_patch_ix_sz;
// iteratate over filter elements
int32_t filts_off = blk_filt_ix_base;
for( int32_t filts_ix_out_chan_elem = 0; filts_ix_out_chan_elem !=
(192 * 1 * 1); ++filts_ix_out_chan_elem ) {
BARRIER_SYNC;
if( LOC_ID_1D < blk_filt_ix_sz ) {
#ifdef NO_IOX // by default, we don't ever disable this, since it's seems about as good as it can be already
//filts_smem[LOC_ID_1D] = LOC_ID_1D;
filts_smem[LOC_ID_1D] = filts[LOC_ID_1D];
#else
filts_smem[LOC_ID_1D] = filts[filts_off + LOC_ID_1D];
#endif
}
for( int32_t i = 0; i != 4; ++i ) {
if( (LOC_ID_1D+LOC_SZ_1D*i) < blk_patch_ix_sz ) {
int32_t const t_smem_patch_ix = (blk_patch_ix_base+LOC_ID_1D+LOC_SZ_1D*i);
#ifdef NO_IO
//float v = LOC_ID_1D;
//float v = in[LOC_ID_1D];
float v = in[filts_off + LOC_ID_1D];
#else
float v = 0;
int const smem_in_ix_y = ((t_smem_patch_ix/28)%28)*1+(filts_ix_out_chan_elem%1) - 0;
int const smem_in_ix_x = (t_smem_patch_ix%28)*1+(filts_ix_out_chan_elem%1) - 0;
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 &&
(t_smem_patch_ix/784) < 20 &&
smem_in_ix_x < 28 && smem_in_ix_y < 28 ) {
v = in[(t_smem_patch_ix/784)*150528 +
filts_ix_out_chan_elem*784 +
smem_in_ix_y*28 +
smem_in_ix_x*1];
};
#endif
in_smem[LOC_ID_1D+LOC_SZ_1D*i] = v;
}
}
filts_off += 16;
BARRIER_SYNC;
#ifdef NO_IO
// begin t_tile_dummy_loads
filts_strip[0] = filts_smem[(LOC_ID_1D % 32) + 0];
filts_strip[1] = filts_smem[(LOC_ID_1D % 32) + 1];
filts_strip[2] = filts_smem[(LOC_ID_1D % 32) + 2];
filts_strip[3] = filts_smem[(LOC_ID_1D % 32) + 3];
filts_strip[4] = filts_smem[(LOC_ID_1D % 32) + 4];
filts_strip[5] = filts_smem[(LOC_ID_1D % 32) + 5];
filts_strip[6] = filts_smem[(LOC_ID_1D % 32) + 6];
filts_strip[7] = filts_smem[(LOC_ID_1D % 32) + 7];
in_strip[0] = in_smem[(LOC_ID_1D % 32) + 0];
in_strip[1] = in_smem[(LOC_ID_1D % 32) + 1];
in_strip[2] = in_smem[(LOC_ID_1D % 32) + 2];
in_strip[3] = in_smem[(LOC_ID_1D % 32) + 3];
in_strip[4] = in_smem[(LOC_ID_1D % 32) + 4];
in_strip[5] = in_smem[(LOC_ID_1D % 32) + 5];
in_strip[6] = in_smem[(LOC_ID_1D % 32) + 6];
in_strip[7] = in_smem[(LOC_ID_1D % 32) + 7];
// end t_tile_dummy_loads;
#else
// begin t_tile_loads
filts_strip[0] = filts_smem[(LOC_ID_1D%2)+0*2];
filts_strip[1] = filts_smem[(LOC_ID_1D%2)+1*2];
filts_strip[2] = filts_smem[(LOC_ID_1D%2)+2*2];
filts_strip[3] = filts_smem[(LOC_ID_1D%2)+3*2];
filts_strip[4] = filts_smem[(LOC_ID_1D%2)+4*2];
filts_strip[5] = filts_smem[(LOC_ID_1D%2)+5*2];
filts_strip[6] = filts_smem[(LOC_ID_1D%2)+6*2];
filts_strip[7] = filts_smem[(LOC_ID_1D%2)+7*2];
in_strip[0] = in_smem[8*(LOC_ID_1D/2)+0];
in_strip[1] = in_smem[8*(LOC_ID_1D/2)+1];
in_strip[2] = in_smem[8*(LOC_ID_1D/2)+2];
in_strip[3] = in_smem[8*(LOC_ID_1D/2)+3];
in_strip[4] = in_smem[8*(LOC_ID_1D/2)+4];
in_strip[5] = in_smem[8*(LOC_ID_1D/2)+5];
in_strip[6] = in_smem[8*(LOC_ID_1D/2)+6];
in_strip[7] = in_smem[8*(LOC_ID_1D/2)+7];
// end t_tile_loads;
#endif
// (2) do 8^2 fmas into out_tile
// begin t_tile_fmas
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
// end t_tile_fmas;
}
// load per-block biases into smem
BARRIER_SYNC;
if( LOC_ID_1D < blk_filt_ix_sz ) {
int32_t const ocix_base = (GRP_ID_1D%1)*blk_filt_ix_sz;
int32_t const load_reg = LOC_ID_1D / 2;
int32_t const load_tile = LOC_ID_1D % 2;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 16 ) { filts_smem[LOC_ID_1D] = biases[ ocix ]; }
//int32_t const ocix_tile = (ocix / 8) % 2;
//int32_t const ocix_reg = ocix % 8;
//filts_smem[ocix_tile * 1 + ocix_reg * 2] = biases[ocix];
}
BARRIER_SYNC;
// load biases into filts_strip
// begin t_tile_loads
filts_strip[0] = filts_smem[(LOC_ID_1D%2)+0*2];
filts_strip[1] = filts_smem[(LOC_ID_1D%2)+1*2];
filts_strip[2] = filts_smem[(LOC_ID_1D%2)+2*2];
filts_strip[3] = filts_smem[(LOC_ID_1D%2)+3*2];
filts_strip[4] = filts_smem[(LOC_ID_1D%2)+4*2];
filts_strip[5] = filts_smem[(LOC_ID_1D%2)+5*2];
filts_strip[6] = filts_smem[(LOC_ID_1D%2)+6*2];
filts_strip[7] = filts_smem[(LOC_ID_1D%2)+7*2];
in_strip[0] = in_smem[8*(LOC_ID_1D/2)+0];
in_strip[1] = in_smem[8*(LOC_ID_1D/2)+1];
in_strip[2] = in_smem[8*(LOC_ID_1D/2)+2];
in_strip[3] = in_smem[8*(LOC_ID_1D/2)+3];
in_strip[4] = in_smem[8*(LOC_ID_1D/2)+4];
in_strip[5] = in_smem[8*(LOC_ID_1D/2)+5];
in_strip[6] = in_smem[8*(LOC_ID_1D/2)+6];
in_strip[7] = in_smem[8*(LOC_ID_1D/2)+7];
// end t_tile_loads;
// add bias to each elem of out_tile[] and store the results to out[]
#ifdef NO_IO
// begin t_tile_dummy_stores
out[0] = 0.0f
+ max(0.0f,out_tile[0] + filts_strip[0])
+ max(0.0f,out_tile[1] + filts_strip[1])
+ max(0.0f,out_tile[2] + filts_strip[2])
+ max(0.0f,out_tile[3] + filts_strip[3])
+ max(0.0f,out_tile[4] + filts_strip[4])
+ max(0.0f,out_tile[5] + filts_strip[5])
+ max(0.0f,out_tile[6] + filts_strip[6])
+ max(0.0f,out_tile[7] + filts_strip[7])
+ max(0.0f,out_tile[8] + filts_strip[0])
+ max(0.0f,out_tile[9] + filts_strip[1])
+ max(0.0f,out_tile[10] + filts_strip[2])
+ max(0.0f,out_tile[11] + filts_strip[3])
+ max(0.0f,out_tile[12] + filts_strip[4])
+ max(0.0f,out_tile[13] + filts_strip[5])
+ max(0.0f,out_tile[14] + filts_strip[6])
+ max(0.0f,out_tile[15] + filts_strip[7])
+ max(0.0f,out_tile[16] + filts_strip[0])
+ max(0.0f,out_tile[17] + filts_strip[1])
+ max(0.0f,out_tile[18] + filts_strip[2])
+ max(0.0f,out_tile[19] + filts_strip[3])
+ max(0.0f,out_tile[20] + filts_strip[4])
+ max(0.0f,out_tile[21] + filts_strip[5])
+ max(0.0f,out_tile[22] + filts_strip[6])
+ max(0.0f,out_tile[23] + filts_strip[7])
+ max(0.0f,out_tile[24] + filts_strip[0])
+ max(0.0f,out_tile[25] + filts_strip[1])
+ max(0.0f,out_tile[26] + filts_strip[2])
+ max(0.0f,out_tile[27] + filts_strip[3])
+ max(0.0f,out_tile[28] + filts_strip[4])
+ max(0.0f,out_tile[29] + filts_strip[5])
+ max(0.0f,out_tile[30] + filts_strip[6])
+ max(0.0f,out_tile[31] + filts_strip[7])
+ max(0.0f,out_tile[32] + filts_strip[0])
+ max(0.0f,out_tile[33] + filts_strip[1])
+ max(0.0f,out_tile[34] + filts_strip[2])
+ max(0.0f,out_tile[35] + filts_strip[3])
+ max(0.0f,out_tile[36] + filts_strip[4])
+ max(0.0f,out_tile[37] + filts_strip[5])
+ max(0.0f,out_tile[38] + filts_strip[6])
+ max(0.0f,out_tile[39] + filts_strip[7])
+ max(0.0f,out_tile[40] + filts_strip[0])
+ max(0.0f,out_tile[41] + filts_strip[1])
+ max(0.0f,out_tile[42] + filts_strip[2])
+ max(0.0f,out_tile[43] + filts_strip[3])
+ max(0.0f,out_tile[44] + filts_strip[4])
+ max(0.0f,out_tile[45] + filts_strip[5])
+ max(0.0f,out_tile[46] + filts_strip[6])
+ max(0.0f,out_tile[47] + filts_strip[7])
+ max(0.0f,out_tile[48] + filts_strip[0])
+ max(0.0f,out_tile[49] + filts_strip[1])
+ max(0.0f,out_tile[50] + filts_strip[2])
+ max(0.0f,out_tile[51] + filts_strip[3])
+ max(0.0f,out_tile[52] + filts_strip[4])
+ max(0.0f,out_tile[53] + filts_strip[5])
+ max(0.0f,out_tile[54] + filts_strip[6])
+ max(0.0f,out_tile[55] + filts_strip[7])
+ max(0.0f,out_tile[56] + filts_strip[0])
+ max(0.0f,out_tile[57] + filts_strip[1])
+ max(0.0f,out_tile[58] + filts_strip[2])
+ max(0.0f,out_tile[59] + filts_strip[3])
+ max(0.0f,out_tile[60] + filts_strip[4])
+ max(0.0f,out_tile[61] + filts_strip[5])
+ max(0.0f,out_tile[62] + filts_strip[6])
+ max(0.0f,out_tile[63] + filts_strip[7])
;
// end t_tile_dummy_stores;
#else
// begin t_tile_stores
int32_t tpix[8];
int32_t tcix[8];
tpix[0] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+0)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+0) % 784 ); // cache out patch ixs
tpix[1] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+1)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+1) % 784 ); // cache out patch ixs
tpix[2] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+2)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+2) % 784 ); // cache out patch ixs
tpix[3] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+3)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+3) % 784 ); // cache out patch ixs
tpix[4] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+4)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+4) % 784 ); // cache out patch ixs
tpix[5] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+5)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+5) % 784 ); // cache out patch ixs
tpix[6] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+6)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+6) % 784 ); // cache out patch ixs
tpix[7] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+7)/784)*12544 +
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+7) % 784 ); // cache out patch ixs
tcix[0] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+0)*784; // cache out chan ixs
tcix[1] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+1)*784; // cache out chan ixs
tcix[2] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+2)*784; // cache out chan ixs
tcix[3] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+3)*784; // cache out chan ixs
tcix[4] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+4)*784; // cache out chan ixs
tcix[5] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+5)*784; // cache out chan ixs
tcix[6] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+6)*784; // cache out chan ixs
tcix[7] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+7)*784; // cache out chan ixs
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+0) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+1) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+2) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+3) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+4) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+5) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+6) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+7) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (16*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (16*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (16*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (16*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (16*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (16*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (16*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (16*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
#endif
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_pad = 0 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 1 */
/* kern_sz = 1 */
/* stride = 1 */
/* out_chans = 16 */
/* in_chans = 192 */
/* rtc_func_name = conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_16__in_chans_192 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 16 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%16) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 12544 */
/* out_ix_img_nomod = (out_ix/12544) */
/* out_ix_img = (out_ix/12544) */
/* out_ix_sz = 250880 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 192 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%192) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 150528 */
/* in_ix_img_nomod = (in_ix/150528) */
/* in_ix_img = (in_ix/150528) */
/* in_ix_sz = 3010560 */
/* t_smem_patch_ix_x_dim = 28 */
/* t_smem_patch_ix_x_sz = 1 */
/* t_smem_patch_ix_x_nomod = t_smem_patch_ix */
/* t_smem_patch_ix_x = (t_smem_patch_ix%%28) */
/* t_smem_patch_ix_y_dim = 28 */
/* t_smem_patch_ix_y_sz = 28 */
/* t_smem_patch_ix_y_nomod = (t_smem_patch_ix/28) */
/* t_smem_patch_ix_y = ((t_smem_patch_ix/28)%%28) */
/* t_smem_patch_ix_img_dim = 20 */
/* t_smem_patch_ix_img_sz = 784 */
/* t_smem_patch_ix_img_nomod = (t_smem_patch_ix/784) */
/* t_smem_patch_ix_img = (t_smem_patch_ix/784) */
/* t_smem_patch_ix_sz = 15680 */
/* filts_ix_out_chan_elem_x_dim = 1 */
/* filts_ix_out_chan_elem_x_sz = 1 */
/* filts_ix_out_chan_elem_x_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_x = (filts_ix_out_chan_elem%%1) */
/* filts_ix_out_chan_elem_y_dim = 1 */
/* filts_ix_out_chan_elem_y_sz = 1 */
/* filts_ix_out_chan_elem_y_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_y = (filts_ix_out_chan_elem%%1) */
/* filts_ix_out_chan_elem_in_chan_dim = 192 */
/* filts_ix_out_chan_elem_in_chan_sz = 1 */
/* filts_ix_out_chan_elem_in_chan_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_in_chan = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_sz = 192 */
/* LOC_ID_1D_out_chan_tile_dim = 2 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%2) */
/* LOC_ID_1D_patch_tile_dim = 64 */
/* LOC_ID_1D_patch_tile_sz = 2 */
/* LOC_ID_1D_patch_tile_nomod = (LOC_ID_1D/2) */
/* LOC_ID_1D_patch_tile = (LOC_ID_1D/2) */
/* LOC_ID_1D_sz = 128 */
/* filts_xp_ix_out_chan_tile_dim = 2 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%2) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 2 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/2) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/2)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 16 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_x = ((filts_xp_ix/16)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 16 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_y = ((filts_xp_ix/16)%%1) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 16 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/16)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 3072 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/3072) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/3072) */
/* filts_xp_ix_sz = 3072 */
/* patch_smem_load_iter = 4 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_patch_blk_dim = 31 */
/* GRP_ID_1D_patch_blk_sz = 1 */
/* GRP_ID_1D_patch_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_patch_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 31 */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* patch_tile = (%(LOC_ID_1D_patch_tile)+%(GRP_ID_1D_patch_blk)*%(LOC_ID_1D_patch_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* patch_ix_0 = (%(patch_tile)*%(t_tile_sz)+0) */
/* patch_ix_0_x_dim = 28 */
/* patch_ix_0_x_sz = 1 */
/* patch_ix_0_x_nomod = %(patch_ix_0) */
/* patch_ix_0_x = (%(patch_ix_0)%%28) */
/* patch_ix_0_y_dim = 28 */
/* patch_ix_0_y_sz = 28 */
/* patch_ix_0_y_nomod = (%(patch_ix_0)/28) */
/* patch_ix_0_y = ((%(patch_ix_0)/28)%%28) */
/* patch_ix_0_img_dim = 20 */
/* patch_ix_0_img_sz = 784 */
/* patch_ix_0_img_nomod = (%(patch_ix_0)/784) */
/* patch_ix_0_img = (%(patch_ix_0)/784) */
/* patch_ix_0_sz = 15680 */
/* patch_ix_1 = (%(patch_tile)*%(t_tile_sz)+1) */
/* patch_ix_1_x_dim = 28 */
/* patch_ix_1_x_sz = 1 */
/* patch_ix_1_x_nomod = %(patch_ix_1) */
/* patch_ix_1_x = (%(patch_ix_1)%%28) */
/* patch_ix_1_y_dim = 28 */
/* patch_ix_1_y_sz = 28 */
/* patch_ix_1_y_nomod = (%(patch_ix_1)/28) */
/* patch_ix_1_y = ((%(patch_ix_1)/28)%%28) */
/* patch_ix_1_img_dim = 20 */
/* patch_ix_1_img_sz = 784 */
/* patch_ix_1_img_nomod = (%(patch_ix_1)/784) */
/* patch_ix_1_img = (%(patch_ix_1)/784) */
/* patch_ix_1_sz = 15680 */
/* patch_ix_2 = (%(patch_tile)*%(t_tile_sz)+2) */
/* patch_ix_2_x_dim = 28 */
/* patch_ix_2_x_sz = 1 */
/* patch_ix_2_x_nomod = %(patch_ix_2) */
/* patch_ix_2_x = (%(patch_ix_2)%%28) */
/* patch_ix_2_y_dim = 28 */
/* patch_ix_2_y_sz = 28 */
/* patch_ix_2_y_nomod = (%(patch_ix_2)/28) */
/* patch_ix_2_y = ((%(patch_ix_2)/28)%%28) */
/* patch_ix_2_img_dim = 20 */
/* patch_ix_2_img_sz = 784 */
/* patch_ix_2_img_nomod = (%(patch_ix_2)/784) */
/* patch_ix_2_img = (%(patch_ix_2)/784) */
/* patch_ix_2_sz = 15680 */
/* patch_ix_3 = (%(patch_tile)*%(t_tile_sz)+3) */
/* patch_ix_3_x_dim = 28 */
/* patch_ix_3_x_sz = 1 */
/* patch_ix_3_x_nomod = %(patch_ix_3) */
/* patch_ix_3_x = (%(patch_ix_3)%%28) */
/* patch_ix_3_y_dim = 28 */
/* patch_ix_3_y_sz = 28 */
/* patch_ix_3_y_nomod = (%(patch_ix_3)/28) */
/* patch_ix_3_y = ((%(patch_ix_3)/28)%%28) */
/* patch_ix_3_img_dim = 20 */
/* patch_ix_3_img_sz = 784 */
/* patch_ix_3_img_nomod = (%(patch_ix_3)/784) */
/* patch_ix_3_img = (%(patch_ix_3)/784) */
/* patch_ix_3_sz = 15680 */
/* patch_ix_4 = (%(patch_tile)*%(t_tile_sz)+4) */
/* patch_ix_4_x_dim = 28 */
/* patch_ix_4_x_sz = 1 */
/* patch_ix_4_x_nomod = %(patch_ix_4) */
/* patch_ix_4_x = (%(patch_ix_4)%%28) */
/* patch_ix_4_y_dim = 28 */
/* patch_ix_4_y_sz = 28 */
/* patch_ix_4_y_nomod = (%(patch_ix_4)/28) */
/* patch_ix_4_y = ((%(patch_ix_4)/28)%%28) */
/* patch_ix_4_img_dim = 20 */
/* patch_ix_4_img_sz = 784 */
/* patch_ix_4_img_nomod = (%(patch_ix_4)/784) */
/* patch_ix_4_img = (%(patch_ix_4)/784) */
/* patch_ix_4_sz = 15680 */
/* patch_ix_5 = (%(patch_tile)*%(t_tile_sz)+5) */
/* patch_ix_5_x_dim = 28 */
/* patch_ix_5_x_sz = 1 */
/* patch_ix_5_x_nomod = %(patch_ix_5) */
/* patch_ix_5_x = (%(patch_ix_5)%%28) */
/* patch_ix_5_y_dim = 28 */
/* patch_ix_5_y_sz = 28 */
/* patch_ix_5_y_nomod = (%(patch_ix_5)/28) */
/* patch_ix_5_y = ((%(patch_ix_5)/28)%%28) */
/* patch_ix_5_img_dim = 20 */
/* patch_ix_5_img_sz = 784 */
/* patch_ix_5_img_nomod = (%(patch_ix_5)/784) */
/* patch_ix_5_img = (%(patch_ix_5)/784) */
/* patch_ix_5_sz = 15680 */
/* patch_ix_6 = (%(patch_tile)*%(t_tile_sz)+6) */
/* patch_ix_6_x_dim = 28 */
/* patch_ix_6_x_sz = 1 */
/* patch_ix_6_x_nomod = %(patch_ix_6) */
/* patch_ix_6_x = (%(patch_ix_6)%%28) */
/* patch_ix_6_y_dim = 28 */
/* patch_ix_6_y_sz = 28 */
/* patch_ix_6_y_nomod = (%(patch_ix_6)/28) */
/* patch_ix_6_y = ((%(patch_ix_6)/28)%%28) */
/* patch_ix_6_img_dim = 20 */
/* patch_ix_6_img_sz = 784 */
/* patch_ix_6_img_nomod = (%(patch_ix_6)/784) */
/* patch_ix_6_img = (%(patch_ix_6)/784) */
/* patch_ix_6_sz = 15680 */
/* patch_ix_7 = (%(patch_tile)*%(t_tile_sz)+7) */
/* patch_ix_7_x_dim = 28 */
/* patch_ix_7_x_sz = 1 */
/* patch_ix_7_x_nomod = %(patch_ix_7) */
/* patch_ix_7_x = (%(patch_ix_7)%%28) */
/* patch_ix_7_y_dim = 28 */
/* patch_ix_7_y_sz = 28 */
/* patch_ix_7_y_nomod = (%(patch_ix_7)/28) */
/* patch_ix_7_y = ((%(patch_ix_7)/28)%%28) */
/* patch_ix_7_img_dim = 20 */
/* patch_ix_7_img_sz = 784 */
/* patch_ix_7_img_nomod = (%(patch_ix_7)/784) */
/* patch_ix_7_img = (%(patch_ix_7)/784) */
/* patch_ix_7_sz = 15680 */
/* get_in = float v = 0;
int const smem_in_ix_y = %(t_smem_patch_ix_y)*%(stride)+%(filts_ix_out_chan_elem_y) - %(in_pad);
int const smem_in_ix_x = %(t_smem_patch_ix_x)*%(stride)+%(filts_ix_out_chan_elem_x) - %(in_pad);
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 &&
%(t_smem_patch_ix_img) < %(in_ix_img_dim) &&
smem_in_ix_x < %(in_ix_x_dim) && smem_in_ix_y < %(in_ix_y_dim) ) {
v = in[%(t_smem_patch_ix_img)*%(in_ix_img_sz) +
%(filts_ix_out_chan_elem_in_chan)*%(in_ix_chan_sz) +
smem_in_ix_y*%(in_ix_y_sz) +
smem_in_ix_x*%(in_ix_x_sz)];
} */
/* t_tile_fmas = // begin t_tile_fmas
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
// end t_tile_fmas */
/* t_tile_loads = // begin t_tile_loads
filts_strip[0] = filts_smem[%(LOC_ID_1D_out_chan_tile)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem[%(LOC_ID_1D_out_chan_tile)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem[%(LOC_ID_1D_out_chan_tile)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem[%(LOC_ID_1D_out_chan_tile)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem[%(LOC_ID_1D_out_chan_tile)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem[%(LOC_ID_1D_out_chan_tile)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem[%(LOC_ID_1D_out_chan_tile)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem[%(LOC_ID_1D_out_chan_tile)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+0];
in_strip[1] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+1];
in_strip[2] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+2];
in_strip[3] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+3];
in_strip[4] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+4];
in_strip[5] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+5];
in_strip[6] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+6];
in_strip[7] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+7];
// end t_tile_loads */
/* t_tile_dummy_loads = // begin t_tile_dummy_loads
filts_strip[0] = filts_smem[(LOC_ID_1D %% 32) + 0];
filts_strip[1] = filts_smem[(LOC_ID_1D %% 32) + 1];
filts_strip[2] = filts_smem[(LOC_ID_1D %% 32) + 2];
filts_strip[3] = filts_smem[(LOC_ID_1D %% 32) + 3];
filts_strip[4] = filts_smem[(LOC_ID_1D %% 32) + 4];
filts_strip[5] = filts_smem[(LOC_ID_1D %% 32) + 5];
filts_strip[6] = filts_smem[(LOC_ID_1D %% 32) + 6];
filts_strip[7] = filts_smem[(LOC_ID_1D %% 32) + 7];
in_strip[0] = in_smem[(LOC_ID_1D %% 32) + 0];
in_strip[1] = in_smem[(LOC_ID_1D %% 32) + 1];
in_strip[2] = in_smem[(LOC_ID_1D %% 32) + 2];
in_strip[3] = in_smem[(LOC_ID_1D %% 32) + 3];
in_strip[4] = in_smem[(LOC_ID_1D %% 32) + 4];
in_strip[5] = in_smem[(LOC_ID_1D %% 32) + 5];
in_strip[6] = in_smem[(LOC_ID_1D %% 32) + 6];
in_strip[7] = in_smem[(LOC_ID_1D %% 32) + 7];
// end t_tile_dummy_loads */
/* t_tile_stores = // begin t_tile_stores
int32_t tpix[%(t_tile_sz)];
int32_t tcix[%(t_tile_sz)];
tpix[0] = %(patch_ix_0_img)*%(out_ix_img_sz) +
( %(patch_ix_0) %% %(patch_ix_0_img_sz) ); // cache out patch ixs
tpix[1] = %(patch_ix_1_img)*%(out_ix_img_sz) +
( %(patch_ix_1) %% %(patch_ix_1_img_sz) ); // cache out patch ixs
tpix[2] = %(patch_ix_2_img)*%(out_ix_img_sz) +
( %(patch_ix_2) %% %(patch_ix_2_img_sz) ); // cache out patch ixs
tpix[3] = %(patch_ix_3_img)*%(out_ix_img_sz) +
( %(patch_ix_3) %% %(patch_ix_3_img_sz) ); // cache out patch ixs
tpix[4] = %(patch_ix_4_img)*%(out_ix_img_sz) +
( %(patch_ix_4) %% %(patch_ix_4_img_sz) ); // cache out patch ixs
tpix[5] = %(patch_ix_5_img)*%(out_ix_img_sz) +
( %(patch_ix_5) %% %(patch_ix_5_img_sz) ); // cache out patch ixs
tpix[6] = %(patch_ix_6_img)*%(out_ix_img_sz) +
( %(patch_ix_6) %% %(patch_ix_6_img_sz) ); // cache out patch ixs
tpix[7] = %(patch_ix_7_img)*%(out_ix_img_sz) +
( %(patch_ix_7) %% %(patch_ix_7_img_sz) ); // cache out patch ixs
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs
if( %(patch_ix_0) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( %(patch_ix_1) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( %(patch_ix_2) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( %(patch_ix_3) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( %(patch_ix_4) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( %(patch_ix_5) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( %(patch_ix_6) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( %(patch_ix_7) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
/* t_tile_dummy_stores = // begin t_tile_dummy_stores
out[0] = 0.0f
+ max(0.0f,out_tile[0] + filts_strip[0])
+ max(0.0f,out_tile[1] + filts_strip[1])
+ max(0.0f,out_tile[2] + filts_strip[2])
+ max(0.0f,out_tile[3] + filts_strip[3])
+ max(0.0f,out_tile[4] + filts_strip[4])
+ max(0.0f,out_tile[5] + filts_strip[5])
+ max(0.0f,out_tile[6] + filts_strip[6])
+ max(0.0f,out_tile[7] + filts_strip[7])
+ max(0.0f,out_tile[8] + filts_strip[0])
+ max(0.0f,out_tile[9] + filts_strip[1])
+ max(0.0f,out_tile[10] + filts_strip[2])
+ max(0.0f,out_tile[11] + filts_strip[3])
+ max(0.0f,out_tile[12] + filts_strip[4])
+ max(0.0f,out_tile[13] + filts_strip[5])
+ max(0.0f,out_tile[14] + filts_strip[6])
+ max(0.0f,out_tile[15] + filts_strip[7])
+ max(0.0f,out_tile[16] + filts_strip[0])
+ max(0.0f,out_tile[17] + filts_strip[1])
+ max(0.0f,out_tile[18] + filts_strip[2])
+ max(0.0f,out_tile[19] + filts_strip[3])
+ max(0.0f,out_tile[20] + filts_strip[4])
+ max(0.0f,out_tile[21] + filts_strip[5])
+ max(0.0f,out_tile[22] + filts_strip[6])
+ max(0.0f,out_tile[23] + filts_strip[7])
+ max(0.0f,out_tile[24] + filts_strip[0])
+ max(0.0f,out_tile[25] + filts_strip[1])
+ max(0.0f,out_tile[26] + filts_strip[2])
+ max(0.0f,out_tile[27] + filts_strip[3])
+ max(0.0f,out_tile[28] + filts_strip[4])
+ max(0.0f,out_tile[29] + filts_strip[5])
+ max(0.0f,out_tile[30] + filts_strip[6])
+ max(0.0f,out_tile[31] + filts_strip[7])
+ max(0.0f,out_tile[32] + filts_strip[0])
+ max(0.0f,out_tile[33] + filts_strip[1])
+ max(0.0f,out_tile[34] + filts_strip[2])
+ max(0.0f,out_tile[35] + filts_strip[3])
+ max(0.0f,out_tile[36] + filts_strip[4])
+ max(0.0f,out_tile[37] + filts_strip[5])
+ max(0.0f,out_tile[38] + filts_strip[6])
+ max(0.0f,out_tile[39] + filts_strip[7])
+ max(0.0f,out_tile[40] + filts_strip[0])
+ max(0.0f,out_tile[41] + filts_strip[1])
+ max(0.0f,out_tile[42] + filts_strip[2])
+ max(0.0f,out_tile[43] + filts_strip[3])
+ max(0.0f,out_tile[44] + filts_strip[4])
+ max(0.0f,out_tile[45] + filts_strip[5])
+ max(0.0f,out_tile[46] + filts_strip[6])
+ max(0.0f,out_tile[47] + filts_strip[7])
+ max(0.0f,out_tile[48] + filts_strip[0])
+ max(0.0f,out_tile[49] + filts_strip[1])
+ max(0.0f,out_tile[50] + filts_strip[2])
+ max(0.0f,out_tile[51] + filts_strip[3])
+ max(0.0f,out_tile[52] + filts_strip[4])
+ max(0.0f,out_tile[53] + filts_strip[5])
+ max(0.0f,out_tile[54] + filts_strip[6])
+ max(0.0f,out_tile[55] + filts_strip[7])
+ max(0.0f,out_tile[56] + filts_strip[0])
+ max(0.0f,out_tile[57] + filts_strip[1])
+ max(0.0f,out_tile[58] + filts_strip[2])
+ max(0.0f,out_tile[59] + filts_strip[3])
+ max(0.0f,out_tile[60] + filts_strip[4])
+ max(0.0f,out_tile[61] + filts_strip[5])
+ max(0.0f,out_tile[62] + filts_strip[6])
+ max(0.0f,out_tile[63] + filts_strip[7])
;
// end t_tile_dummy_stores */
// xpose_filts: copies every filter element from the out_chan:in_chan:y:x layout of `in` to the
// out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile layout of `out`. One work-item moves one element.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_16__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
					  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D; // flat index of the source element
  if( filts_ix >= 3072 ) { return; } // work-items past the last of the 3072 elements do nothing

  // coordinates of the source element (out_chan:in_chan:y:x)
  int32_t const fioc = (filts_ix/192); // out_chan
  int32_t const src_in_chan = (filts_ix%192);
  int32_t const src_y = (filts_ix%1); // always 0 (y extent is 1)
  int32_t const src_x = (filts_ix%1); // always 0 (x extent is 1)

  // out_chan re-split as out_chan_blk:out_chan_tile:out_chan_reg
  int32_t const oc_blk = (fioc/16);
  int32_t const oc_tile = ((fioc/8)%2);
  int32_t const oc_reg = (fioc%8);

  // flat destination index; each coordinate is scaled by its stride in the output layout
  int32_t const filts_xp_ix =
    oc_blk*3072 +
    src_in_chan*16 +
    src_y*16 +
    src_x*16 +
    oc_reg*2 +
    oc_tile*1;

#if 1
  float const val = in[filts_ix];
#else
  // disabled debug path: writes a value derived from the coordinates instead of the filter data
  float val = 0.0f;
  if( src_in_chan == 0 ) {
    // if( ((filts_ix%1) == 5) && ((filts_ix%1) == 5) )
    {
      val = (filts_ix%1)*100 + (filts_ix%1);
    }
  }
#endif
  out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 16 */
/* in_chans = 192 */
/* kysz = 1 */
/* kxsz = 1 */
/* rtc_func_name = xpose_filts__out_chans_16__in_chans_192__kysz_1__kxsz_1 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 1 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%1) */
/* filts_ix_y_dim = 1 */
/* filts_ix_y_sz = 1 */
/* filts_ix_y_nomod = filts_ix */
/* filts_ix_y = (filts_ix%%1) */
/* filts_ix_in_chan_dim = 192 */
/* filts_ix_in_chan_sz = 1 */
/* filts_ix_in_chan_nomod = filts_ix */
/* filts_ix_in_chan = (filts_ix%%192) */
/* filts_ix_out_chan_dim = 16 */
/* filts_ix_out_chan_sz = 192 */
/* filts_ix_out_chan_nomod = (filts_ix/192) */
/* filts_ix_out_chan = (filts_ix/192) */
/* filts_ix_sz = 3072 */
/* filts_xp_ix_out_chan_tile_dim = 2 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%2) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 2 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/2) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/2)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 16 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_x = ((filts_xp_ix/16)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 16 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_y = ((filts_xp_ix/16)%%1) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 16 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/16)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 3072 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/3072) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/3072) */
/* filts_xp_ix_sz = 3072 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 2 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%2) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 16 */
/* fioc_out_chan_blk_nomod = (fioc/16) */
/* fioc_out_chan_blk = (fioc/16) */
/* fioc_sz = 16 */
// tconv: tiled convolution (5x5 kernel, stride 1, pad 2) followed by a bias add and ReLU (max(0,x)).
// Sizes are fixed at generation time (see the kernel name): 20 images, 28x28 input, 16 in chans, 32 out chans.
// each thread: computes 8x8 block of out (8 adjacent x positions of one output line, for 8 output channels)
// loop over k dim (in_chan outermost, then ky; kx is fully unrolled in the inner loop body)
//   filts  : filter weights; 160 consecutive floats are staged per (in_chan,ky) step, starting at blk_filt_ix_base
//   biases : per-output-channel bias, indexed 0..31
//   in     : input; 480 consecutive floats are staged per in_chan, starting at GRP_ID_1D*7680
//   out    : output, indexed as img*25088 + chan*784 + y*28 + x
//   flags  : 2 -> return right after the accumulation; 1 -> return right after the bias loads;
//            any other value -> store the results.
//            NOTE(review): the early-return modes look like profiling/debug aids -- confirm with the host code.
// The strided loads below step by 128 work-items per pass, i.e. they assume a work-group size of 128.
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_5__stride_1__in_pad_2__t_tile_sz_8__conv_has_relu_1__out_chans_32__in_chans_16( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
  LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(160+480,1024)
  LSMASQ float * const filts_smem = all_smem;
  LSMASQ float * const in_smem = filts_smem + 160;
  float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
  // reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
  float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
  float in_strip[12]; // segment of input line sufficient for one unrolling of inner loop
  int32_t blk_in_ix_base = GRP_ID_1D*7680 + LOC_ID_1D;// index of first input pel to load for this thread
  int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*12800; // index of first out chan
  int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
  LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%4); // LOC_ID_1D%4 selects this thread's group of 8 out chans
  int32_t out_line = (GRP_ID_1D/4)*32; // first out_line of block
  int32_t const blk_fli = (out_line/28); // image of first out_line of block
  out_line += (LOC_ID_1D/4); // adjust to out_line of this thread
  // offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img)
  int32_t const img_off_lines = ((out_line/28) - blk_fli)*(5-1);
  int32_t const in_y = (out_line%28)*1 - 2; // input row for ky == 0; negative when it lies in the top padding
  for( int32_t in_chan = 0; in_chan != 16; ++in_chan ) {
    BARRIER_SYNC; // nobody may still be reading the previous channel's data from in_smem
    // begin in_smem_loads
    in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];
    in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ];
    in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ];
    if( (LOC_ID_1D + 128 * 3) < 480) { in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ];}
    blk_in_ix_base += 480;
    // end in_smem_loads;
    for( int32_t ky = 0; ky != 5; ++ky ) {
      if( ky != 0 ) { BARRIER_SYNC; } // for ky == 0 the barrier at the top of the in_chan loop already covers this
      // begin filt_smem_loads
      filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
      if( (LOC_ID_1D + 128 * 1) < 160 ) { filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];}
      filts_off += 160;
      // end filt_smem_loads;
      BARRIER_SYNC;
      // every barrier of this ky step is above this point, so the 'continue's below skip only the math
      if( (out_line/28) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid)
      // optimization: skip known-to-be-padding input lines. valid input rows are 0..27, so row 28 is padding as well.
      if( ((in_y+ky) < 0) || ((in_y+ky) >= 28) ) { continue; }
      LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/4)*1+ky+img_off_lines)*12;
      // begin inner_loop_body
      // 12 input pels cover the 8 output x positions for all 5 kx taps (8 + 5 - 1)
      in_strip[0] = in_smem_off[0];
      in_strip[1] = in_smem_off[1];
      in_strip[2] = in_smem_off[2];
      in_strip[3] = in_smem_off[3];
      in_strip[4] = in_smem_off[4];
      in_strip[5] = in_smem_off[5];
      in_strip[6] = in_smem_off[6];
      in_strip[7] = in_smem_off[7];
      in_strip[8] = in_smem_off[8];
      in_strip[9] = in_smem_off[9];
      in_strip[10] = in_smem_off[10];
      in_strip[11] = in_smem_off[11];
      // kx == 0: out_tile[x*8+c] += filts_strip[c]*in_strip[x+0]
      filts_strip[0] = filts_smem_off[0*32+0*4];
      filts_strip[1] = filts_smem_off[0*32+1*4];
      filts_strip[2] = filts_smem_off[0*32+2*4];
      filts_strip[3] = filts_smem_off[0*32+3*4];
      filts_strip[4] = filts_smem_off[0*32+4*4];
      filts_strip[5] = filts_smem_off[0*32+5*4];
      filts_strip[6] = filts_smem_off[0*32+6*4];
      filts_strip[7] = filts_smem_off[0*32+7*4];
      out_tile[0] += filts_strip[0]*in_strip[0];
      out_tile[1] += filts_strip[1]*in_strip[0];
      out_tile[2] += filts_strip[2]*in_strip[0];
      out_tile[3] += filts_strip[3]*in_strip[0];
      out_tile[4] += filts_strip[4]*in_strip[0];
      out_tile[5] += filts_strip[5]*in_strip[0];
      out_tile[6] += filts_strip[6]*in_strip[0];
      out_tile[7] += filts_strip[7]*in_strip[0];
      out_tile[8] += filts_strip[0]*in_strip[1];
      out_tile[9] += filts_strip[1]*in_strip[1];
      out_tile[10] += filts_strip[2]*in_strip[1];
      out_tile[11] += filts_strip[3]*in_strip[1];
      out_tile[12] += filts_strip[4]*in_strip[1];
      out_tile[13] += filts_strip[5]*in_strip[1];
      out_tile[14] += filts_strip[6]*in_strip[1];
      out_tile[15] += filts_strip[7]*in_strip[1];
      out_tile[16] += filts_strip[0]*in_strip[2];
      out_tile[17] += filts_strip[1]*in_strip[2];
      out_tile[18] += filts_strip[2]*in_strip[2];
      out_tile[19] += filts_strip[3]*in_strip[2];
      out_tile[20] += filts_strip[4]*in_strip[2];
      out_tile[21] += filts_strip[5]*in_strip[2];
      out_tile[22] += filts_strip[6]*in_strip[2];
      out_tile[23] += filts_strip[7]*in_strip[2];
      out_tile[24] += filts_strip[0]*in_strip[3];
      out_tile[25] += filts_strip[1]*in_strip[3];
      out_tile[26] += filts_strip[2]*in_strip[3];
      out_tile[27] += filts_strip[3]*in_strip[3];
      out_tile[28] += filts_strip[4]*in_strip[3];
      out_tile[29] += filts_strip[5]*in_strip[3];
      out_tile[30] += filts_strip[6]*in_strip[3];
      out_tile[31] += filts_strip[7]*in_strip[3];
      out_tile[32] += filts_strip[0]*in_strip[4];
      out_tile[33] += filts_strip[1]*in_strip[4];
      out_tile[34] += filts_strip[2]*in_strip[4];
      out_tile[35] += filts_strip[3]*in_strip[4];
      out_tile[36] += filts_strip[4]*in_strip[4];
      out_tile[37] += filts_strip[5]*in_strip[4];
      out_tile[38] += filts_strip[6]*in_strip[4];
      out_tile[39] += filts_strip[7]*in_strip[4];
      out_tile[40] += filts_strip[0]*in_strip[5];
      out_tile[41] += filts_strip[1]*in_strip[5];
      out_tile[42] += filts_strip[2]*in_strip[5];
      out_tile[43] += filts_strip[3]*in_strip[5];
      out_tile[44] += filts_strip[4]*in_strip[5];
      out_tile[45] += filts_strip[5]*in_strip[5];
      out_tile[46] += filts_strip[6]*in_strip[5];
      out_tile[47] += filts_strip[7]*in_strip[5];
      out_tile[48] += filts_strip[0]*in_strip[6];
      out_tile[49] += filts_strip[1]*in_strip[6];
      out_tile[50] += filts_strip[2]*in_strip[6];
      out_tile[51] += filts_strip[3]*in_strip[6];
      out_tile[52] += filts_strip[4]*in_strip[6];
      out_tile[53] += filts_strip[5]*in_strip[6];
      out_tile[54] += filts_strip[6]*in_strip[6];
      out_tile[55] += filts_strip[7]*in_strip[6];
      out_tile[56] += filts_strip[0]*in_strip[7];
      out_tile[57] += filts_strip[1]*in_strip[7];
      out_tile[58] += filts_strip[2]*in_strip[7];
      out_tile[59] += filts_strip[3]*in_strip[7];
      out_tile[60] += filts_strip[4]*in_strip[7];
      out_tile[61] += filts_strip[5]*in_strip[7];
      out_tile[62] += filts_strip[6]*in_strip[7];
      out_tile[63] += filts_strip[7]*in_strip[7];
      // kx == 1: out_tile[x*8+c] += filts_strip[c]*in_strip[x+1]
      filts_strip[0] = filts_smem_off[1*32+0*4];
      filts_strip[1] = filts_smem_off[1*32+1*4];
      filts_strip[2] = filts_smem_off[1*32+2*4];
      filts_strip[3] = filts_smem_off[1*32+3*4];
      filts_strip[4] = filts_smem_off[1*32+4*4];
      filts_strip[5] = filts_smem_off[1*32+5*4];
      filts_strip[6] = filts_smem_off[1*32+6*4];
      filts_strip[7] = filts_smem_off[1*32+7*4];
      out_tile[0] += filts_strip[0]*in_strip[1];
      out_tile[1] += filts_strip[1]*in_strip[1];
      out_tile[2] += filts_strip[2]*in_strip[1];
      out_tile[3] += filts_strip[3]*in_strip[1];
      out_tile[4] += filts_strip[4]*in_strip[1];
      out_tile[5] += filts_strip[5]*in_strip[1];
      out_tile[6] += filts_strip[6]*in_strip[1];
      out_tile[7] += filts_strip[7]*in_strip[1];
      out_tile[8] += filts_strip[0]*in_strip[2];
      out_tile[9] += filts_strip[1]*in_strip[2];
      out_tile[10] += filts_strip[2]*in_strip[2];
      out_tile[11] += filts_strip[3]*in_strip[2];
      out_tile[12] += filts_strip[4]*in_strip[2];
      out_tile[13] += filts_strip[5]*in_strip[2];
      out_tile[14] += filts_strip[6]*in_strip[2];
      out_tile[15] += filts_strip[7]*in_strip[2];
      out_tile[16] += filts_strip[0]*in_strip[3];
      out_tile[17] += filts_strip[1]*in_strip[3];
      out_tile[18] += filts_strip[2]*in_strip[3];
      out_tile[19] += filts_strip[3]*in_strip[3];
      out_tile[20] += filts_strip[4]*in_strip[3];
      out_tile[21] += filts_strip[5]*in_strip[3];
      out_tile[22] += filts_strip[6]*in_strip[3];
      out_tile[23] += filts_strip[7]*in_strip[3];
      out_tile[24] += filts_strip[0]*in_strip[4];
      out_tile[25] += filts_strip[1]*in_strip[4];
      out_tile[26] += filts_strip[2]*in_strip[4];
      out_tile[27] += filts_strip[3]*in_strip[4];
      out_tile[28] += filts_strip[4]*in_strip[4];
      out_tile[29] += filts_strip[5]*in_strip[4];
      out_tile[30] += filts_strip[6]*in_strip[4];
      out_tile[31] += filts_strip[7]*in_strip[4];
      out_tile[32] += filts_strip[0]*in_strip[5];
      out_tile[33] += filts_strip[1]*in_strip[5];
      out_tile[34] += filts_strip[2]*in_strip[5];
      out_tile[35] += filts_strip[3]*in_strip[5];
      out_tile[36] += filts_strip[4]*in_strip[5];
      out_tile[37] += filts_strip[5]*in_strip[5];
      out_tile[38] += filts_strip[6]*in_strip[5];
      out_tile[39] += filts_strip[7]*in_strip[5];
      out_tile[40] += filts_strip[0]*in_strip[6];
      out_tile[41] += filts_strip[1]*in_strip[6];
      out_tile[42] += filts_strip[2]*in_strip[6];
      out_tile[43] += filts_strip[3]*in_strip[6];
      out_tile[44] += filts_strip[4]*in_strip[6];
      out_tile[45] += filts_strip[5]*in_strip[6];
      out_tile[46] += filts_strip[6]*in_strip[6];
      out_tile[47] += filts_strip[7]*in_strip[6];
      out_tile[48] += filts_strip[0]*in_strip[7];
      out_tile[49] += filts_strip[1]*in_strip[7];
      out_tile[50] += filts_strip[2]*in_strip[7];
      out_tile[51] += filts_strip[3]*in_strip[7];
      out_tile[52] += filts_strip[4]*in_strip[7];
      out_tile[53] += filts_strip[5]*in_strip[7];
      out_tile[54] += filts_strip[6]*in_strip[7];
      out_tile[55] += filts_strip[7]*in_strip[7];
      out_tile[56] += filts_strip[0]*in_strip[8];
      out_tile[57] += filts_strip[1]*in_strip[8];
      out_tile[58] += filts_strip[2]*in_strip[8];
      out_tile[59] += filts_strip[3]*in_strip[8];
      out_tile[60] += filts_strip[4]*in_strip[8];
      out_tile[61] += filts_strip[5]*in_strip[8];
      out_tile[62] += filts_strip[6]*in_strip[8];
      out_tile[63] += filts_strip[7]*in_strip[8];
      // kx == 2: out_tile[x*8+c] += filts_strip[c]*in_strip[x+2]
      filts_strip[0] = filts_smem_off[2*32+0*4];
      filts_strip[1] = filts_smem_off[2*32+1*4];
      filts_strip[2] = filts_smem_off[2*32+2*4];
      filts_strip[3] = filts_smem_off[2*32+3*4];
      filts_strip[4] = filts_smem_off[2*32+4*4];
      filts_strip[5] = filts_smem_off[2*32+5*4];
      filts_strip[6] = filts_smem_off[2*32+6*4];
      filts_strip[7] = filts_smem_off[2*32+7*4];
      out_tile[0] += filts_strip[0]*in_strip[2];
      out_tile[1] += filts_strip[1]*in_strip[2];
      out_tile[2] += filts_strip[2]*in_strip[2];
      out_tile[3] += filts_strip[3]*in_strip[2];
      out_tile[4] += filts_strip[4]*in_strip[2];
      out_tile[5] += filts_strip[5]*in_strip[2];
      out_tile[6] += filts_strip[6]*in_strip[2];
      out_tile[7] += filts_strip[7]*in_strip[2];
      out_tile[8] += filts_strip[0]*in_strip[3];
      out_tile[9] += filts_strip[1]*in_strip[3];
      out_tile[10] += filts_strip[2]*in_strip[3];
      out_tile[11] += filts_strip[3]*in_strip[3];
      out_tile[12] += filts_strip[4]*in_strip[3];
      out_tile[13] += filts_strip[5]*in_strip[3];
      out_tile[14] += filts_strip[6]*in_strip[3];
      out_tile[15] += filts_strip[7]*in_strip[3];
      out_tile[16] += filts_strip[0]*in_strip[4];
      out_tile[17] += filts_strip[1]*in_strip[4];
      out_tile[18] += filts_strip[2]*in_strip[4];
      out_tile[19] += filts_strip[3]*in_strip[4];
      out_tile[20] += filts_strip[4]*in_strip[4];
      out_tile[21] += filts_strip[5]*in_strip[4];
      out_tile[22] += filts_strip[6]*in_strip[4];
      out_tile[23] += filts_strip[7]*in_strip[4];
      out_tile[24] += filts_strip[0]*in_strip[5];
      out_tile[25] += filts_strip[1]*in_strip[5];
      out_tile[26] += filts_strip[2]*in_strip[5];
      out_tile[27] += filts_strip[3]*in_strip[5];
      out_tile[28] += filts_strip[4]*in_strip[5];
      out_tile[29] += filts_strip[5]*in_strip[5];
      out_tile[30] += filts_strip[6]*in_strip[5];
      out_tile[31] += filts_strip[7]*in_strip[5];
      out_tile[32] += filts_strip[0]*in_strip[6];
      out_tile[33] += filts_strip[1]*in_strip[6];
      out_tile[34] += filts_strip[2]*in_strip[6];
      out_tile[35] += filts_strip[3]*in_strip[6];
      out_tile[36] += filts_strip[4]*in_strip[6];
      out_tile[37] += filts_strip[5]*in_strip[6];
      out_tile[38] += filts_strip[6]*in_strip[6];
      out_tile[39] += filts_strip[7]*in_strip[6];
      out_tile[40] += filts_strip[0]*in_strip[7];
      out_tile[41] += filts_strip[1]*in_strip[7];
      out_tile[42] += filts_strip[2]*in_strip[7];
      out_tile[43] += filts_strip[3]*in_strip[7];
      out_tile[44] += filts_strip[4]*in_strip[7];
      out_tile[45] += filts_strip[5]*in_strip[7];
      out_tile[46] += filts_strip[6]*in_strip[7];
      out_tile[47] += filts_strip[7]*in_strip[7];
      out_tile[48] += filts_strip[0]*in_strip[8];
      out_tile[49] += filts_strip[1]*in_strip[8];
      out_tile[50] += filts_strip[2]*in_strip[8];
      out_tile[51] += filts_strip[3]*in_strip[8];
      out_tile[52] += filts_strip[4]*in_strip[8];
      out_tile[53] += filts_strip[5]*in_strip[8];
      out_tile[54] += filts_strip[6]*in_strip[8];
      out_tile[55] += filts_strip[7]*in_strip[8];
      out_tile[56] += filts_strip[0]*in_strip[9];
      out_tile[57] += filts_strip[1]*in_strip[9];
      out_tile[58] += filts_strip[2]*in_strip[9];
      out_tile[59] += filts_strip[3]*in_strip[9];
      out_tile[60] += filts_strip[4]*in_strip[9];
      out_tile[61] += filts_strip[5]*in_strip[9];
      out_tile[62] += filts_strip[6]*in_strip[9];
      out_tile[63] += filts_strip[7]*in_strip[9];
      // kx == 3: out_tile[x*8+c] += filts_strip[c]*in_strip[x+3]
      filts_strip[0] = filts_smem_off[3*32+0*4];
      filts_strip[1] = filts_smem_off[3*32+1*4];
      filts_strip[2] = filts_smem_off[3*32+2*4];
      filts_strip[3] = filts_smem_off[3*32+3*4];
      filts_strip[4] = filts_smem_off[3*32+4*4];
      filts_strip[5] = filts_smem_off[3*32+5*4];
      filts_strip[6] = filts_smem_off[3*32+6*4];
      filts_strip[7] = filts_smem_off[3*32+7*4];
      out_tile[0] += filts_strip[0]*in_strip[3];
      out_tile[1] += filts_strip[1]*in_strip[3];
      out_tile[2] += filts_strip[2]*in_strip[3];
      out_tile[3] += filts_strip[3]*in_strip[3];
      out_tile[4] += filts_strip[4]*in_strip[3];
      out_tile[5] += filts_strip[5]*in_strip[3];
      out_tile[6] += filts_strip[6]*in_strip[3];
      out_tile[7] += filts_strip[7]*in_strip[3];
      out_tile[8] += filts_strip[0]*in_strip[4];
      out_tile[9] += filts_strip[1]*in_strip[4];
      out_tile[10] += filts_strip[2]*in_strip[4];
      out_tile[11] += filts_strip[3]*in_strip[4];
      out_tile[12] += filts_strip[4]*in_strip[4];
      out_tile[13] += filts_strip[5]*in_strip[4];
      out_tile[14] += filts_strip[6]*in_strip[4];
      out_tile[15] += filts_strip[7]*in_strip[4];
      out_tile[16] += filts_strip[0]*in_strip[5];
      out_tile[17] += filts_strip[1]*in_strip[5];
      out_tile[18] += filts_strip[2]*in_strip[5];
      out_tile[19] += filts_strip[3]*in_strip[5];
      out_tile[20] += filts_strip[4]*in_strip[5];
      out_tile[21] += filts_strip[5]*in_strip[5];
      out_tile[22] += filts_strip[6]*in_strip[5];
      out_tile[23] += filts_strip[7]*in_strip[5];
      out_tile[24] += filts_strip[0]*in_strip[6];
      out_tile[25] += filts_strip[1]*in_strip[6];
      out_tile[26] += filts_strip[2]*in_strip[6];
      out_tile[27] += filts_strip[3]*in_strip[6];
      out_tile[28] += filts_strip[4]*in_strip[6];
      out_tile[29] += filts_strip[5]*in_strip[6];
      out_tile[30] += filts_strip[6]*in_strip[6];
      out_tile[31] += filts_strip[7]*in_strip[6];
      out_tile[32] += filts_strip[0]*in_strip[7];
      out_tile[33] += filts_strip[1]*in_strip[7];
      out_tile[34] += filts_strip[2]*in_strip[7];
      out_tile[35] += filts_strip[3]*in_strip[7];
      out_tile[36] += filts_strip[4]*in_strip[7];
      out_tile[37] += filts_strip[5]*in_strip[7];
      out_tile[38] += filts_strip[6]*in_strip[7];
      out_tile[39] += filts_strip[7]*in_strip[7];
      out_tile[40] += filts_strip[0]*in_strip[8];
      out_tile[41] += filts_strip[1]*in_strip[8];
      out_tile[42] += filts_strip[2]*in_strip[8];
      out_tile[43] += filts_strip[3]*in_strip[8];
      out_tile[44] += filts_strip[4]*in_strip[8];
      out_tile[45] += filts_strip[5]*in_strip[8];
      out_tile[46] += filts_strip[6]*in_strip[8];
      out_tile[47] += filts_strip[7]*in_strip[8];
      out_tile[48] += filts_strip[0]*in_strip[9];
      out_tile[49] += filts_strip[1]*in_strip[9];
      out_tile[50] += filts_strip[2]*in_strip[9];
      out_tile[51] += filts_strip[3]*in_strip[9];
      out_tile[52] += filts_strip[4]*in_strip[9];
      out_tile[53] += filts_strip[5]*in_strip[9];
      out_tile[54] += filts_strip[6]*in_strip[9];
      out_tile[55] += filts_strip[7]*in_strip[9];
      out_tile[56] += filts_strip[0]*in_strip[10];
      out_tile[57] += filts_strip[1]*in_strip[10];
      out_tile[58] += filts_strip[2]*in_strip[10];
      out_tile[59] += filts_strip[3]*in_strip[10];
      out_tile[60] += filts_strip[4]*in_strip[10];
      out_tile[61] += filts_strip[5]*in_strip[10];
      out_tile[62] += filts_strip[6]*in_strip[10];
      out_tile[63] += filts_strip[7]*in_strip[10];
      // kx == 4: out_tile[x*8+c] += filts_strip[c]*in_strip[x+4]
      filts_strip[0] = filts_smem_off[4*32+0*4];
      filts_strip[1] = filts_smem_off[4*32+1*4];
      filts_strip[2] = filts_smem_off[4*32+2*4];
      filts_strip[3] = filts_smem_off[4*32+3*4];
      filts_strip[4] = filts_smem_off[4*32+4*4];
      filts_strip[5] = filts_smem_off[4*32+5*4];
      filts_strip[6] = filts_smem_off[4*32+6*4];
      filts_strip[7] = filts_smem_off[4*32+7*4];
      out_tile[0] += filts_strip[0]*in_strip[4];
      out_tile[1] += filts_strip[1]*in_strip[4];
      out_tile[2] += filts_strip[2]*in_strip[4];
      out_tile[3] += filts_strip[3]*in_strip[4];
      out_tile[4] += filts_strip[4]*in_strip[4];
      out_tile[5] += filts_strip[5]*in_strip[4];
      out_tile[6] += filts_strip[6]*in_strip[4];
      out_tile[7] += filts_strip[7]*in_strip[4];
      out_tile[8] += filts_strip[0]*in_strip[5];
      out_tile[9] += filts_strip[1]*in_strip[5];
      out_tile[10] += filts_strip[2]*in_strip[5];
      out_tile[11] += filts_strip[3]*in_strip[5];
      out_tile[12] += filts_strip[4]*in_strip[5];
      out_tile[13] += filts_strip[5]*in_strip[5];
      out_tile[14] += filts_strip[6]*in_strip[5];
      out_tile[15] += filts_strip[7]*in_strip[5];
      out_tile[16] += filts_strip[0]*in_strip[6];
      out_tile[17] += filts_strip[1]*in_strip[6];
      out_tile[18] += filts_strip[2]*in_strip[6];
      out_tile[19] += filts_strip[3]*in_strip[6];
      out_tile[20] += filts_strip[4]*in_strip[6];
      out_tile[21] += filts_strip[5]*in_strip[6];
      out_tile[22] += filts_strip[6]*in_strip[6];
      out_tile[23] += filts_strip[7]*in_strip[6];
      out_tile[24] += filts_strip[0]*in_strip[7];
      out_tile[25] += filts_strip[1]*in_strip[7];
      out_tile[26] += filts_strip[2]*in_strip[7];
      out_tile[27] += filts_strip[3]*in_strip[7];
      out_tile[28] += filts_strip[4]*in_strip[7];
      out_tile[29] += filts_strip[5]*in_strip[7];
      out_tile[30] += filts_strip[6]*in_strip[7];
      out_tile[31] += filts_strip[7]*in_strip[7];
      out_tile[32] += filts_strip[0]*in_strip[8];
      out_tile[33] += filts_strip[1]*in_strip[8];
      out_tile[34] += filts_strip[2]*in_strip[8];
      out_tile[35] += filts_strip[3]*in_strip[8];
      out_tile[36] += filts_strip[4]*in_strip[8];
      out_tile[37] += filts_strip[5]*in_strip[8];
      out_tile[38] += filts_strip[6]*in_strip[8];
      out_tile[39] += filts_strip[7]*in_strip[8];
      out_tile[40] += filts_strip[0]*in_strip[9];
      out_tile[41] += filts_strip[1]*in_strip[9];
      out_tile[42] += filts_strip[2]*in_strip[9];
      out_tile[43] += filts_strip[3]*in_strip[9];
      out_tile[44] += filts_strip[4]*in_strip[9];
      out_tile[45] += filts_strip[5]*in_strip[9];
      out_tile[46] += filts_strip[6]*in_strip[9];
      out_tile[47] += filts_strip[7]*in_strip[9];
      out_tile[48] += filts_strip[0]*in_strip[10];
      out_tile[49] += filts_strip[1]*in_strip[10];
      out_tile[50] += filts_strip[2]*in_strip[10];
      out_tile[51] += filts_strip[3]*in_strip[10];
      out_tile[52] += filts_strip[4]*in_strip[10];
      out_tile[53] += filts_strip[5]*in_strip[10];
      out_tile[54] += filts_strip[6]*in_strip[10];
      out_tile[55] += filts_strip[7]*in_strip[10];
      out_tile[56] += filts_strip[0]*in_strip[11];
      out_tile[57] += filts_strip[1]*in_strip[11];
      out_tile[58] += filts_strip[2]*in_strip[11];
      out_tile[59] += filts_strip[3]*in_strip[11];
      out_tile[60] += filts_strip[4]*in_strip[11];
      out_tile[61] += filts_strip[5]*in_strip[11];
      out_tile[62] += filts_strip[6]*in_strip[11];
      out_tile[63] += filts_strip[7]*in_strip[11];
      // end inner_loop_body
    }
  }
  if( flags == 2 ) { return; }
  BARRIER_SYNC; // all reads of the filter data in filts_smem must finish before it is reused for the biases
  for( int32_t i = 0; i != 1; ++i ) {
    int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
    if( t_smem_bias_ix < 32 ) {
      int32_t const ocix_base = (GRP_ID_1D%1)*32;
      // biases are stored in the same tile-minor order as the filters (slot = reg*4 + tile),
      // so the filts_smem_off[c*4] reads below pick up the bias for channel out_chan + c
      int32_t const load_reg = t_smem_bias_ix / 4;
      int32_t const load_tile = t_smem_bias_ix % 4;
      int32_t const ocix = ocix_base + load_tile*8 + load_reg;
      if( ocix < 32 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
    }
  }
  BARRIER_SYNC;
  // begin t_tile_bias_loads
  filts_strip[0] = filts_smem_off[0*4];
  filts_strip[1] = filts_smem_off[1*4];
  filts_strip[2] = filts_smem_off[2*4];
  filts_strip[3] = filts_smem_off[3*4];
  filts_strip[4] = filts_smem_off[4*4];
  filts_strip[5] = filts_smem_off[5*4];
  filts_strip[6] = filts_smem_off[6*4];
  filts_strip[7] = filts_smem_off[7*4];
  // end t_tile_bias_loads;
  if( flags == 1 ) { return; }
  // begin t_tile_stores
  if( (out_line/28) >= 20 ) { return; } // this line belongs to an image past the last one: nothing to store
  int32_t out_x = (GRP_ID_1D%4)*8; // first of the 8 x positions of this thread's tile
  int32_t out_chan = ((GRP_ID_1D%1)*4 + (LOC_ID_1D%4))*8; // first of the 8 output channels of this thread's tile
  GASQ float * out_off = out + (out_line/28)*25088 + out_chan*784 + (out_line%28)*28 + out_x*1 ;
  // each store writes max(0.0f, accumulated value + bias); out_tile[x*8+c] goes to channel out_chan+c, column out_x+x
  if( (out_x + 0) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); }
  if( (out_x + 1) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); }
  if( (out_x + 2) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); }
  if( (out_x + 3) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); }
  if( (out_x + 4) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); }
  if( (out_x + 5) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); }
  if( (out_x + 6) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); }
  if( (out_x + 7) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); }
  if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); }
  if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); }
  if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); }
  if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); }
  if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); }
  if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); }
  if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); }
  // end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* kern_sz = 5 */
/* stride = 1 */
/* in_pad = 2 */
/* t_tile_sz = 8 */
/* conv_has_relu = 1 */
/* out_chans = 32 */
/* in_chans = 16 */
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_5__stride_1__in_pad_2__t_tile_sz_8__conv_has_relu_1__out_chans_32__in_chans_16 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 32 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%32) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 25088 */
/* out_ix_img_nomod = (out_ix/25088) */
/* out_ix_img = (out_ix/25088) */
/* out_ix_sz = 501760 */
/* tpb = 128 */
/* out_line_y_dim = 28 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%28) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 28 */
/* out_line_img_nomod = (out_line/28) */
/* out_line_img = (out_line/28) */
/* out_line_sz = 560 */
/* in_ix_blk_x_dim = 12 */
/* in_ix_blk_x_sz = 1 */
/* in_ix_blk_x_nomod = in_ix */
/* in_ix_blk_x = (in_ix%%12) */
/* in_ix_blk_y_dim = 40 */
/* in_ix_blk_y_sz = 12 */
/* in_ix_blk_y_nomod = (in_ix/12) */
/* in_ix_blk_y = ((in_ix/12)%%40) */
/* in_ix_blk_in_chan_dim = 16 */
/* in_ix_blk_in_chan_sz = 480 */
/* in_ix_blk_in_chan_nomod = (in_ix/480) */
/* in_ix_blk_in_chan = ((in_ix/480)%%16) */
/* in_ix_blk_bx_dim = 4 */
/* in_ix_blk_bx_sz = 7680 */
/* in_ix_blk_bx_nomod = (in_ix/7680) */
/* in_ix_blk_bx = ((in_ix/7680)%%4) */
/* in_ix_blk_bline_dim = 18 */
/* in_ix_blk_bline_sz = 30720 */
/* in_ix_blk_bline_nomod = (in_ix/30720) */
/* in_ix_blk_bline = (in_ix/30720) */
/* in_ix_sz = 552960 */
/* LOC_ID_1D_out_chan_tile_dim = 4 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%4) */
/* LOC_ID_1D_blk_y_dim = 32 */
/* LOC_ID_1D_blk_y_sz = 4 */
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/4) */
/* LOC_ID_1D_blk_y = (LOC_ID_1D/4) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_blk_bx_dim = 4 */
/* GRP_ID_1D_blk_bx_sz = 1 */
/* GRP_ID_1D_blk_bx_nomod = GRP_ID_1D */
/* GRP_ID_1D_blk_bx = (GRP_ID_1D%%4) */
/* GRP_ID_1D_blk_bline_dim = 18 */
/* GRP_ID_1D_blk_bline_sz = 4 */
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/4) */
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/4) */
/* GRP_ID_1D_sz = 72 */
/* blk_filt_ix_sz = 32 */
/* filts_smem_sz = 160 */
/* in_smem_sz = 480 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1024 */
/* filts_xp_ix_out_chan_tile_dim = 4 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 4 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */
/* filts_xp_ix_x_dim = 5 */
/* filts_xp_ix_x_sz = 32 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_x = ((filts_xp_ix/32)%%5) */
/* filts_xp_ix_y_dim = 5 */
/* filts_xp_ix_y_sz = 160 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/160) */
/* filts_xp_ix_y = ((filts_xp_ix/160)%%5) */
/* filts_xp_ix_in_chan_dim = 16 */
/* filts_xp_ix_in_chan_sz = 800 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/800) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/800)%%16) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 12800 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12800) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12800) */
/* filts_xp_ix_sz = 12800 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* filt_smem_loads = // begin filt_smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
if( (LOC_ID_1D + %(tpb) * 1) < %(filts_smem_sz) ) { filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];}
filts_off += %(filts_xp_ix_y_sz);
// end filt_smem_loads */
/* in_smem_loads = // begin in_smem_loads
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ];
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ];
if( (LOC_ID_1D + %(tpb) * 3) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];}
blk_in_ix_base += %(in_ix_blk_in_chan_sz);
// end in_smem_loads */
/* inner_loop_body = // begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
in_strip[10] = in_smem_off[10];
in_strip[11] = in_smem_off[11];
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[3];
out_tile[1] += filts_strip[1]*in_strip[3];
out_tile[2] += filts_strip[2]*in_strip[3];
out_tile[3] += filts_strip[3]*in_strip[3];
out_tile[4] += filts_strip[4]*in_strip[3];
out_tile[5] += filts_strip[5]*in_strip[3];
out_tile[6] += filts_strip[6]*in_strip[3];
out_tile[7] += filts_strip[7]*in_strip[3];
out_tile[8] += filts_strip[0]*in_strip[4];
out_tile[9] += filts_strip[1]*in_strip[4];
out_tile[10] += filts_strip[2]*in_strip[4];
out_tile[11] += filts_strip[3]*in_strip[4];
out_tile[12] += filts_strip[4]*in_strip[4];
out_tile[13] += filts_strip[5]*in_strip[4];
out_tile[14] += filts_strip[6]*in_strip[4];
out_tile[15] += filts_strip[7]*in_strip[4];
out_tile[16] += filts_strip[0]*in_strip[5];
out_tile[17] += filts_strip[1]*in_strip[5];
out_tile[18] += filts_strip[2]*in_strip[5];
out_tile[19] += filts_strip[3]*in_strip[5];
out_tile[20] += filts_strip[4]*in_strip[5];
out_tile[21] += filts_strip[5]*in_strip[5];
out_tile[22] += filts_strip[6]*in_strip[5];
out_tile[23] += filts_strip[7]*in_strip[5];
out_tile[24] += filts_strip[0]*in_strip[6];
out_tile[25] += filts_strip[1]*in_strip[6];
out_tile[26] += filts_strip[2]*in_strip[6];
out_tile[27] += filts_strip[3]*in_strip[6];
out_tile[28] += filts_strip[4]*in_strip[6];
out_tile[29] += filts_strip[5]*in_strip[6];
out_tile[30] += filts_strip[6]*in_strip[6];
out_tile[31] += filts_strip[7]*in_strip[6];
out_tile[32] += filts_strip[0]*in_strip[7];
out_tile[33] += filts_strip[1]*in_strip[7];
out_tile[34] += filts_strip[2]*in_strip[7];
out_tile[35] += filts_strip[3]*in_strip[7];
out_tile[36] += filts_strip[4]*in_strip[7];
out_tile[37] += filts_strip[5]*in_strip[7];
out_tile[38] += filts_strip[6]*in_strip[7];
out_tile[39] += filts_strip[7]*in_strip[7];
out_tile[40] += filts_strip[0]*in_strip[8];
out_tile[41] += filts_strip[1]*in_strip[8];
out_tile[42] += filts_strip[2]*in_strip[8];
out_tile[43] += filts_strip[3]*in_strip[8];
out_tile[44] += filts_strip[4]*in_strip[8];
out_tile[45] += filts_strip[5]*in_strip[8];
out_tile[46] += filts_strip[6]*in_strip[8];
out_tile[47] += filts_strip[7]*in_strip[8];
out_tile[48] += filts_strip[0]*in_strip[9];
out_tile[49] += filts_strip[1]*in_strip[9];
out_tile[50] += filts_strip[2]*in_strip[9];
out_tile[51] += filts_strip[3]*in_strip[9];
out_tile[52] += filts_strip[4]*in_strip[9];
out_tile[53] += filts_strip[5]*in_strip[9];
out_tile[54] += filts_strip[6]*in_strip[9];
out_tile[55] += filts_strip[7]*in_strip[9];
out_tile[56] += filts_strip[0]*in_strip[10];
out_tile[57] += filts_strip[1]*in_strip[10];
out_tile[58] += filts_strip[2]*in_strip[10];
out_tile[59] += filts_strip[3]*in_strip[10];
out_tile[60] += filts_strip[4]*in_strip[10];
out_tile[61] += filts_strip[5]*in_strip[10];
out_tile[62] += filts_strip[6]*in_strip[10];
out_tile[63] += filts_strip[7]*in_strip[10];
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[4];
out_tile[1] += filts_strip[1]*in_strip[4];
out_tile[2] += filts_strip[2]*in_strip[4];
out_tile[3] += filts_strip[3]*in_strip[4];
out_tile[4] += filts_strip[4]*in_strip[4];
out_tile[5] += filts_strip[5]*in_strip[4];
out_tile[6] += filts_strip[6]*in_strip[4];
out_tile[7] += filts_strip[7]*in_strip[4];
out_tile[8] += filts_strip[0]*in_strip[5];
out_tile[9] += filts_strip[1]*in_strip[5];
out_tile[10] += filts_strip[2]*in_strip[5];
out_tile[11] += filts_strip[3]*in_strip[5];
out_tile[12] += filts_strip[4]*in_strip[5];
out_tile[13] += filts_strip[5]*in_strip[5];
out_tile[14] += filts_strip[6]*in_strip[5];
out_tile[15] += filts_strip[7]*in_strip[5];
out_tile[16] += filts_strip[0]*in_strip[6];
out_tile[17] += filts_strip[1]*in_strip[6];
out_tile[18] += filts_strip[2]*in_strip[6];
out_tile[19] += filts_strip[3]*in_strip[6];
out_tile[20] += filts_strip[4]*in_strip[6];
out_tile[21] += filts_strip[5]*in_strip[6];
out_tile[22] += filts_strip[6]*in_strip[6];
out_tile[23] += filts_strip[7]*in_strip[6];
out_tile[24] += filts_strip[0]*in_strip[7];
out_tile[25] += filts_strip[1]*in_strip[7];
out_tile[26] += filts_strip[2]*in_strip[7];
out_tile[27] += filts_strip[3]*in_strip[7];
out_tile[28] += filts_strip[4]*in_strip[7];
out_tile[29] += filts_strip[5]*in_strip[7];
out_tile[30] += filts_strip[6]*in_strip[7];
out_tile[31] += filts_strip[7]*in_strip[7];
out_tile[32] += filts_strip[0]*in_strip[8];
out_tile[33] += filts_strip[1]*in_strip[8];
out_tile[34] += filts_strip[2]*in_strip[8];
out_tile[35] += filts_strip[3]*in_strip[8];
out_tile[36] += filts_strip[4]*in_strip[8];
out_tile[37] += filts_strip[5]*in_strip[8];
out_tile[38] += filts_strip[6]*in_strip[8];
out_tile[39] += filts_strip[7]*in_strip[8];
out_tile[40] += filts_strip[0]*in_strip[9];
out_tile[41] += filts_strip[1]*in_strip[9];
out_tile[42] += filts_strip[2]*in_strip[9];
out_tile[43] += filts_strip[3]*in_strip[9];
out_tile[44] += filts_strip[4]*in_strip[9];
out_tile[45] += filts_strip[5]*in_strip[9];
out_tile[46] += filts_strip[6]*in_strip[9];
out_tile[47] += filts_strip[7]*in_strip[9];
out_tile[48] += filts_strip[0]*in_strip[10];
out_tile[49] += filts_strip[1]*in_strip[10];
out_tile[50] += filts_strip[2]*in_strip[10];
out_tile[51] += filts_strip[3]*in_strip[10];
out_tile[52] += filts_strip[4]*in_strip[10];
out_tile[53] += filts_strip[5]*in_strip[10];
out_tile[54] += filts_strip[6]*in_strip[10];
out_tile[55] += filts_strip[7]*in_strip[10];
out_tile[56] += filts_strip[0]*in_strip[11];
out_tile[57] += filts_strip[1]*in_strip[11];
out_tile[58] += filts_strip[2]*in_strip[11];
out_tile[59] += filts_strip[3]*in_strip[11];
out_tile[60] += filts_strip[4]*in_strip[11];
out_tile[61] += filts_strip[5]*in_strip[11];
out_tile[62] += filts_strip[6]*in_strip[11];
out_tile[63] += filts_strip[7]*in_strip[11];
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* t_tile_stores = // begin t_tile_stores
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; }
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz);
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz);
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ;
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_5__in_pad_2__in_chans_16__ysz_28__xsz_28__tix_pels_tile_sz_32__t_tile_sz_8__bix_pels_blk_sz_72( GASQ float const * const in, GASQ float * const out ) {
  // Each work-item produces exactly one element of `out` (552960 in total):
  // either a copy of one element of `in`, or 0.0f when the computed source
  // coordinate lies outside the 28x28 image or past image 19.
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 552960 ) { return; }

  // Split the flat output index into its components; the divisors/moduli
  // follow the out_ix_blk_* entries of the substitution table below
  // (layout blk_bline:blk_bx:blk_in_chan:blk_y:blk_x).
  int32_t const blk_x = out_ix % 12;
  int32_t const blk_y = (out_ix / 12) % 40;
  int32_t const blk_in_chan = (out_ix / 480) % 16;
  int32_t const blk_bx = (out_ix / 7680) % 4;
  int32_t const blk_bline = out_ix / 30720;

  // First output line covered by this block; each block line spans 32 output lines.
  int32_t const first_out_line = blk_bline * 32;

  // Input line counted from the top of the image that first_out_line falls in.
  int32_t const src_line = (first_out_line % 28) * 1 + blk_y;

  // Input lines consumed per image: (28 - 1)*1 + 5 == 32
  // (presumably (ysz - 1)*stride + kern_sz -- confirm against the generator).
  int32_t const lines_per_img = (28 - 1)*1 + 5;

  // src_line may run past the current image; the quotient selects how many
  // images to advance, the remainder is the line inside that image.
  int32_t const img = (first_out_line / 28) + (src_line / lines_per_img);

  // The -2 shifts into source coordinates (matches in_pad = 2 in the table),
  // so src_y / src_x may be negative or >= 28 near the borders.
  int32_t const src_y = (src_line % lines_per_img) - 2;
  int32_t const src_x = blk_bx*8*1 + blk_x - 2;

  // Anything outside the source image (or beyond the last image) is written as zero.
  if( ( src_x < 0 ) || ( src_y < 0 ) || ( src_x >= 28 ) || ( src_y >= 28 ) || ( img >= 20 ) ) {
    out[out_ix] = 0.0f;
    return;
  }

  // Source layout is img:chan:y:x with sizes 12544:784:28:1.
  out[out_ix] = in[ img*12544 + blk_in_chan*784 + src_y*28 + src_x*1 ];
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* stride = 1 */
/* kern_sz = 5 */
/* in_pad = 2 */
/* in_chans = 16 */
/* ysz = 28 */
/* xsz = 28 */
/* tix_pels_tile_sz = 32 */
/* t_tile_sz = 8 */
/* bix_pels_blk_sz = 72 */
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_5__in_pad_2__in_chans_16__ysz_28__xsz_28__tix_pels_tile_sz_32__t_tile_sz_8__bix_pels_blk_sz_72 */
/* out_ix_blk_x_dim = 12 */
/* out_ix_blk_x_sz = 1 */
/* out_ix_blk_x_nomod = out_ix */
/* out_ix_blk_x = (out_ix%%12) */
/* out_ix_blk_y_dim = 40 */
/* out_ix_blk_y_sz = 12 */
/* out_ix_blk_y_nomod = (out_ix/12) */
/* out_ix_blk_y = ((out_ix/12)%%40) */
/* out_ix_blk_in_chan_dim = 16 */
/* out_ix_blk_in_chan_sz = 480 */
/* out_ix_blk_in_chan_nomod = (out_ix/480) */
/* out_ix_blk_in_chan = ((out_ix/480)%%16) */
/* out_ix_blk_bx_dim = 4 */
/* out_ix_blk_bx_sz = 7680 */
/* out_ix_blk_bx_nomod = (out_ix/7680) */
/* out_ix_blk_bx = ((out_ix/7680)%%4) */
/* out_ix_blk_bline_dim = 18 */
/* out_ix_blk_bline_sz = 30720 */
/* out_ix_blk_bline_nomod = (out_ix/30720) */
/* out_ix_blk_bline = (out_ix/30720) */
/* out_ix_sz = 552960 */
/* out_line_y_dim = 28 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%28) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 28 */
/* out_line_img_nomod = (out_line/28) */
/* out_line_img = (out_line/28) */
/* out_line_sz = 560 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 16 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%16) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 12544 */
/* in_ix_img_nomod = (in_ix/12544) */
/* in_ix_img = (in_ix/12544) */
/* in_ix_sz = 250880 */
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_32__in_chans_16__kysz_5__kxsz_5( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  // Each work-item moves one filter element from its position in `in`
  // (out_chan:in_chan:y:x) to its position in the re-ordered `out` layout
  // (out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile).
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix >= 12800 ) { return; }

  // Coordinates of this element in the source layout (sizes 400:25:5:1).
  int32_t const kx = filts_ix % 5;
  int32_t const ky = (filts_ix / 5) % 5;
  int32_t const in_chan = (filts_ix / 25) % 16;
  int32_t const fioc = filts_ix / 400; // source output channel

  // Split the output channel as blk:tile:reg with dims (-):4:8.
  int32_t const oc_blk = fioc / 32;
  int32_t const oc_tile = (fioc / 8) % 4;
  int32_t const oc_reg = fioc % 8;

  // Destination strides, outermost to innermost: 12800:800:160:32:4:1.
  int32_t const filts_xp_ix =
    oc_blk*12800 +
    in_chan*800 +
    ky*160 +
    kx*32 +
    oc_reg*4 +
    oc_tile*1;

#if 1
  // Normal path: plain copy of the source element.
  float const val = in[filts_ix];
#else
  // Debug path: ignore `in` and emit x*100 + y for input channel 0, zero elsewhere.
  float val = 0.0f;
  if( in_chan == 0 ) {
    val = kx*100 + ky;
  }
#endif
  out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 32 */
/* in_chans = 16 */
/* kysz = 5 */
/* kxsz = 5 */
/* rtc_func_name = xpose_filts__out_chans_32__in_chans_16__kysz_5__kxsz_5 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 5 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%5) */
/* filts_ix_y_dim = 5 */
/* filts_ix_y_sz = 5 */
/* filts_ix_y_nomod = (filts_ix/5) */
/* filts_ix_y = ((filts_ix/5)%%5) */
/* filts_ix_in_chan_dim = 16 */
/* filts_ix_in_chan_sz = 25 */
/* filts_ix_in_chan_nomod = (filts_ix/25) */
/* filts_ix_in_chan = ((filts_ix/25)%%16) */
/* filts_ix_out_chan_dim = 32 */
/* filts_ix_out_chan_sz = 400 */
/* filts_ix_out_chan_nomod = (filts_ix/400) */
/* filts_ix_out_chan = (filts_ix/400) */
/* filts_ix_sz = 12800 */
/* filts_xp_ix_out_chan_tile_dim = 4 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 4 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */
/* filts_xp_ix_x_dim = 5 */
/* filts_xp_ix_x_sz = 32 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_x = ((filts_xp_ix/32)%%5) */
/* filts_xp_ix_y_dim = 5 */
/* filts_xp_ix_y_sz = 160 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/160) */
/* filts_xp_ix_y = ((filts_xp_ix/160)%%5) */
/* filts_xp_ix_in_chan_dim = 16 */
/* filts_xp_ix_in_chan_sz = 800 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/800) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/800)%%16) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 12800 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12800) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12800) */
/* filts_xp_ix_sz = 12800 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 4 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%4) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 32 */
/* fioc_out_chan_blk_nomod = (fioc/32) */
/* fioc_out_chan_blk = (fioc/32) */
/* fioc_sz = 32 */
// 3x3, stride-1 max over a 28x28 plane with a 1-pixel border; one work-item per output element.
// `in` and `out` are both indexed as img*150528 + chan*784 + y*28 + x.
// Window positions that fall outside the plane contribute 0, and the running max starts at 0,
// so the stored value is never negative.
// NOTE(review): that clamps all-negative windows to 0 -- confirm this is intended for the inputs used.
CUCL_GLOBAL_KERNEL void pool__num_imgs_20__in_pad_1__in_dim_0_28__in_dim_1_28__conv_has_relu_0__kern_sz_3__stride_1__out_chans_192__avg_pool_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 3010560 ) { return; }
  // parts of the index that do not depend on the window offset
  int32_t const plane_base = (out_ix/150528)*150528 + ((out_ix/784)%192)*784;
  int const center_y = ((out_ix/28)%28)*1;
  int const center_x = (out_ix%28)*1;
  float best = 0.0f;
  for( int32_t kx = 0; kx != 3; ++kx ) {
    int const src_x = center_x + kx - 1;
    for( int32_t ky = 0; ky != 3; ++ky ) {
      int const src_y = center_y + ky - 1;
      // only read `in` when the window position lies inside the 28x28 plane
      int const inside = ( src_y >= 0 ) && ( src_x >= 0 ) && ( src_x < 28 ) && ( src_y < 28 );
      float const v = inside ? in[plane_base + src_y*28 + src_x*1] : 0.0f;
      best = max( best, v );
    }
  }
  out[out_ix] = best;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_pad = 1 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 0 */
/* kern_sz = 3 */
/* stride = 1 */
/* out_chans = 192 */
/* avg_pool = 0 */
/* rtc_func_name = pool__num_imgs_20__in_pad_1__in_dim_0_28__in_dim_1_28__conv_has_relu_0__kern_sz_3__stride_1__out_chans_192__avg_pool_0 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 192 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%192) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 150528 */
/* out_ix_img_nomod = (out_ix/150528) */
/* out_ix_img = (out_ix/150528) */
/* out_ix_sz = 3010560 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 192 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%192) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 150528 */
/* in_ix_img_nomod = (in_ix/150528) */
/* in_ix_img = (in_ix/150528) */
/* in_ix_sz = 3010560 */
/* op = out_v = max( out_v, v ) */
/* op_post = */
// 256 tbp
// each thread: computes 8x8 block of out
// loop over k dim
// 1x1, stride-1 convolution with bias add and ReLU (the stores write max(0.0f, sum + bias)).
// Arguments:
//   filts  - filter weights; 32 consecutive floats are read per input channel, starting at blk_filt_ix_base
//   biases - per-output-channel biases; indices 0..31 are read
//   in     - input, indexed as img*150528 + in_chan*784 + y*28 + x
//   out    - output, indexed as img*25088 + out_chan*784 + (patch % 784)
// Work decomposition: a "patch" is an output pixel numbered img*784 + y*28 + x. Each work-group
// covers 256 consecutive patches; each work-item accumulates an 8 (patch) x 8 (out chan) tile.
// LOC_ID_1D%4 selects the output-channel tile and LOC_ID_1D/4 selects the patch tile.
// NOTE(review): the in_smem fill loop runs 2 iterations of LOC_SZ_1D to cover 256 patches, so it
// appears to assume a local size of 128 -- confirm against the launch code.
CUCL_GLOBAL_KERNEL void conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out ) {
// local-memory staging buffer: one input value per patch of this work-group, for the current input channel
LOCSHAR_MEM float in_smem[32*8];
int32_t const blk_filt_ix_sz = 4*8;
// local-memory staging buffer: one filter value per output channel (later reused for the biases)
LOCSHAR_MEM float filts_smem[4*8]; // aka blk_filt_ix_sz, which wasn't const enough for OpenCL
float out_tile[8*8] = {0}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[8]; // across patches (approx square block in x/y space, favoring x if sqrt() not integer)
// (GRP_ID_1D%1) is always 0, so the filter base offset is 0 for every work-group
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*6144;
int32_t const blk_patch_ix_sz = 32*8;
int32_t const blk_patch_ix_base = GRP_ID_1D*blk_patch_ix_sz;
// iterate over filter elements
int32_t filts_off = blk_filt_ix_base;
for( int32_t filts_ix_out_chan_elem = 0; filts_ix_out_chan_elem !=
(192 * 1 * 1); ++filts_ix_out_chan_elem ) {
// wait until every work-item has finished reading the previous iteration's smem contents before overwriting them
BARRIER_SYNC;
// (1a) the first 32 work-items stage this iteration's 32 filter values
if( LOC_ID_1D < blk_filt_ix_sz ) {
#ifdef NO_IOX // by default, we don't ever disable this, since it seems about as good as it can be already
//filts_smem[LOC_ID_1D] = LOC_ID_1D;
filts_smem[LOC_ID_1D] = filts[LOC_ID_1D];
#else
filts_smem[LOC_ID_1D] = filts[filts_off + LOC_ID_1D];
#endif
}
// (1b) all work-items cooperatively stage one input value per patch of this work-group
for( int32_t i = 0; i != 2; ++i ) {
if( (LOC_ID_1D+LOC_SZ_1D*i) < blk_patch_ix_sz ) {
int32_t const t_smem_patch_ix = (blk_patch_ix_base+LOC_ID_1D+LOC_SZ_1D*i);
#ifdef NO_IO
//float v = LOC_ID_1D;
//float v = in[LOC_ID_1D];
float v = in[filts_off + LOC_ID_1D];
#else
float v = 0;
// the (filts_ix_out_chan_elem%1) terms are always 0, so the input pixel coincides with the patch pixel
int const smem_in_ix_y = ((t_smem_patch_ix/28)%28)*1+(filts_ix_out_chan_elem%1) - 0;
int const smem_in_ix_x = (t_smem_patch_ix%28)*1+(filts_ix_out_chan_elem%1) - 0;
// patches whose image index is 20 or more (past the last image) keep v == 0
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 &&
(t_smem_patch_ix/784) < 20 &&
smem_in_ix_x < 28 && smem_in_ix_y < 28 ) {
v = in[(t_smem_patch_ix/784)*150528 +
filts_ix_out_chan_elem*784 +
smem_in_ix_y*28 +
smem_in_ix_x*1];
};
#endif
in_smem[LOC_ID_1D+LOC_SZ_1D*i] = v;
}
}
filts_off += 32;
// make the staged filter and input values visible to the whole work-group
BARRIER_SYNC;
#ifdef NO_IO
// NOTE(review): (LOC_ID_1D % 32) + 7 can reach 38, past the end of the 32-element filts_smem;
// this only affects builds with NO_IO defined.
// begin t_tile_dummy_loads
filts_strip[0] = filts_smem[(LOC_ID_1D % 32) + 0];
filts_strip[1] = filts_smem[(LOC_ID_1D % 32) + 1];
filts_strip[2] = filts_smem[(LOC_ID_1D % 32) + 2];
filts_strip[3] = filts_smem[(LOC_ID_1D % 32) + 3];
filts_strip[4] = filts_smem[(LOC_ID_1D % 32) + 4];
filts_strip[5] = filts_smem[(LOC_ID_1D % 32) + 5];
filts_strip[6] = filts_smem[(LOC_ID_1D % 32) + 6];
filts_strip[7] = filts_smem[(LOC_ID_1D % 32) + 7];
in_strip[0] = in_smem[(LOC_ID_1D % 32) + 0];
in_strip[1] = in_smem[(LOC_ID_1D % 32) + 1];
in_strip[2] = in_smem[(LOC_ID_1D % 32) + 2];
in_strip[3] = in_smem[(LOC_ID_1D % 32) + 3];
in_strip[4] = in_smem[(LOC_ID_1D % 32) + 4];
in_strip[5] = in_smem[(LOC_ID_1D % 32) + 5];
in_strip[6] = in_smem[(LOC_ID_1D % 32) + 6];
in_strip[7] = in_smem[(LOC_ID_1D % 32) + 7];
// end t_tile_dummy_loads;
#else
// filts_strip[r] is the filter value for output channel (LOC_ID_1D%4)*8 + r (smem layout is reg*4 + tile);
// in_strip[p] is the input value for this work-item's p-th patch
// begin t_tile_loads
filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4];
filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4];
filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4];
filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4];
filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4];
filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4];
filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4];
filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4];
in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0];
in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1];
in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2];
in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3];
in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4];
in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5];
in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6];
in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7];
// end t_tile_loads;
#endif
// (2) do 8^2 fmas into out_tile
// out_tile[p*8 + r] accumulates filts_strip[r]*in_strip[p] (p = patch, r = output channel within the tile)
// begin t_tile_fmas
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
// end t_tile_fmas;
}
// load per-block biases into smem
// (filts_smem is reused for the biases; the barrier keeps the last iteration's reads ahead of the overwrite)
BARRIER_SYNC;
if( LOC_ID_1D < blk_filt_ix_sz ) {
int32_t const ocix_base = (GRP_ID_1D%1)*blk_filt_ix_sz;
// slot LOC_ID_1D == load_reg*4 + load_tile receives the bias of channel load_tile*8 + load_reg,
// i.e. the same reg*4 + tile layout that the strip loads below read back
int32_t const load_reg = LOC_ID_1D / 4;
int32_t const load_tile = LOC_ID_1D % 4;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 32 ) { filts_smem[LOC_ID_1D] = biases[ ocix ]; }
//int32_t const ocix_tile = (ocix / 8) % 4;
//int32_t const ocix_reg = ocix % 8;
//filts_smem[ocix_tile * 1 + ocix_reg * 4] = biases[ocix];
}
BARRIER_SYNC;
// load biases into filts_strip
// (in_strip is reloaded as well, but it is not read again below)
// begin t_tile_loads
filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4];
filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4];
filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4];
filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4];
filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4];
filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4];
filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4];
filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4];
in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0];
in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1];
in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2];
in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3];
in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4];
in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5];
in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6];
in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7];
// end t_tile_loads;
// add bias to each elem of out_tile[] and store the results to out[]
#ifdef NO_IO
// with NO_IO defined, all 64 results are summed into out[0] instead of being stored individually
// begin t_tile_dummy_stores
out[0] = 0.0f
+ max(0.0f,out_tile[0] + filts_strip[0])
+ max(0.0f,out_tile[1] + filts_strip[1])
+ max(0.0f,out_tile[2] + filts_strip[2])
+ max(0.0f,out_tile[3] + filts_strip[3])
+ max(0.0f,out_tile[4] + filts_strip[4])
+ max(0.0f,out_tile[5] + filts_strip[5])
+ max(0.0f,out_tile[6] + filts_strip[6])
+ max(0.0f,out_tile[7] + filts_strip[7])
+ max(0.0f,out_tile[8] + filts_strip[0])
+ max(0.0f,out_tile[9] + filts_strip[1])
+ max(0.0f,out_tile[10] + filts_strip[2])
+ max(0.0f,out_tile[11] + filts_strip[3])
+ max(0.0f,out_tile[12] + filts_strip[4])
+ max(0.0f,out_tile[13] + filts_strip[5])
+ max(0.0f,out_tile[14] + filts_strip[6])
+ max(0.0f,out_tile[15] + filts_strip[7])
+ max(0.0f,out_tile[16] + filts_strip[0])
+ max(0.0f,out_tile[17] + filts_strip[1])
+ max(0.0f,out_tile[18] + filts_strip[2])
+ max(0.0f,out_tile[19] + filts_strip[3])
+ max(0.0f,out_tile[20] + filts_strip[4])
+ max(0.0f,out_tile[21] + filts_strip[5])
+ max(0.0f,out_tile[22] + filts_strip[6])
+ max(0.0f,out_tile[23] + filts_strip[7])
+ max(0.0f,out_tile[24] + filts_strip[0])
+ max(0.0f,out_tile[25] + filts_strip[1])
+ max(0.0f,out_tile[26] + filts_strip[2])
+ max(0.0f,out_tile[27] + filts_strip[3])
+ max(0.0f,out_tile[28] + filts_strip[4])
+ max(0.0f,out_tile[29] + filts_strip[5])
+ max(0.0f,out_tile[30] + filts_strip[6])
+ max(0.0f,out_tile[31] + filts_strip[7])
+ max(0.0f,out_tile[32] + filts_strip[0])
+ max(0.0f,out_tile[33] + filts_strip[1])
+ max(0.0f,out_tile[34] + filts_strip[2])
+ max(0.0f,out_tile[35] + filts_strip[3])
+ max(0.0f,out_tile[36] + filts_strip[4])
+ max(0.0f,out_tile[37] + filts_strip[5])
+ max(0.0f,out_tile[38] + filts_strip[6])
+ max(0.0f,out_tile[39] + filts_strip[7])
+ max(0.0f,out_tile[40] + filts_strip[0])
+ max(0.0f,out_tile[41] + filts_strip[1])
+ max(0.0f,out_tile[42] + filts_strip[2])
+ max(0.0f,out_tile[43] + filts_strip[3])
+ max(0.0f,out_tile[44] + filts_strip[4])
+ max(0.0f,out_tile[45] + filts_strip[5])
+ max(0.0f,out_tile[46] + filts_strip[6])
+ max(0.0f,out_tile[47] + filts_strip[7])
+ max(0.0f,out_tile[48] + filts_strip[0])
+ max(0.0f,out_tile[49] + filts_strip[1])
+ max(0.0f,out_tile[50] + filts_strip[2])
+ max(0.0f,out_tile[51] + filts_strip[3])
+ max(0.0f,out_tile[52] + filts_strip[4])
+ max(0.0f,out_tile[53] + filts_strip[5])
+ max(0.0f,out_tile[54] + filts_strip[6])
+ max(0.0f,out_tile[55] + filts_strip[7])
+ max(0.0f,out_tile[56] + filts_strip[0])
+ max(0.0f,out_tile[57] + filts_strip[1])
+ max(0.0f,out_tile[58] + filts_strip[2])
+ max(0.0f,out_tile[59] + filts_strip[3])
+ max(0.0f,out_tile[60] + filts_strip[4])
+ max(0.0f,out_tile[61] + filts_strip[5])
+ max(0.0f,out_tile[62] + filts_strip[6])
+ max(0.0f,out_tile[63] + filts_strip[7])
;
// end t_tile_dummy_stores;
#else
// tpix[p]: offset in out[] of this work-item's p-th patch (image part plus pixel part, channel excluded)
// tcix[r]: offset in out[] of this work-item's r-th output channel
// begin t_tile_stores
int32_t tpix[8];
int32_t tcix[8];
tpix[0] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) % 784 ); // cache out patch ixs
tpix[1] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) % 784 ); // cache out patch ixs
tpix[2] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) % 784 ); // cache out patch ixs
tpix[3] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) % 784 ); // cache out patch ixs
tpix[4] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) % 784 ); // cache out patch ixs
tpix[5] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) % 784 ); // cache out patch ixs
tpix[6] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) % 784 ); // cache out patch ixs
tpix[7] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7)/784)*25088 +
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) % 784 ); // cache out patch ixs
tcix[0] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+0)*784; // cache out chan ixs
tcix[1] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+1)*784; // cache out chan ixs
tcix[2] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+2)*784; // cache out chan ixs
tcix[3] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+3)*784; // cache out chan ixs
tcix[4] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+4)*784; // cache out chan ixs
tcix[5] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+5)*784; // cache out chan ixs
tcix[6] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+6)*784; // cache out chan ixs
tcix[7] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+7)*784; // cache out chan ixs
// For each of the 8 patches in turn: stop at the first patch index >= 15680, otherwise store its 8 channel
// values. Returning early is safe here because no barrier follows. The tcix[r] < (32*784) tests skip
// channels at or beyond 32.
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (32*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (32*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (32*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (32*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (32*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (32*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (32*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (32*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
#endif
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_pad = 0 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 1 */
/* kern_sz = 1 */
/* stride = 1 */
/* out_chans = 32 */
/* in_chans = 192 */
/* rtc_func_name = conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_192 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 32 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%32) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 25088 */
/* out_ix_img_nomod = (out_ix/25088) */
/* out_ix_img = (out_ix/25088) */
/* out_ix_sz = 501760 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 192 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%192) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 150528 */
/* in_ix_img_nomod = (in_ix/150528) */
/* in_ix_img = (in_ix/150528) */
/* in_ix_sz = 3010560 */
/* t_smem_patch_ix_x_dim = 28 */
/* t_smem_patch_ix_x_sz = 1 */
/* t_smem_patch_ix_x_nomod = t_smem_patch_ix */
/* t_smem_patch_ix_x = (t_smem_patch_ix%%28) */
/* t_smem_patch_ix_y_dim = 28 */
/* t_smem_patch_ix_y_sz = 28 */
/* t_smem_patch_ix_y_nomod = (t_smem_patch_ix/28) */
/* t_smem_patch_ix_y = ((t_smem_patch_ix/28)%%28) */
/* t_smem_patch_ix_img_dim = 20 */
/* t_smem_patch_ix_img_sz = 784 */
/* t_smem_patch_ix_img_nomod = (t_smem_patch_ix/784) */
/* t_smem_patch_ix_img = (t_smem_patch_ix/784) */
/* t_smem_patch_ix_sz = 15680 */
/* filts_ix_out_chan_elem_x_dim = 1 */
/* filts_ix_out_chan_elem_x_sz = 1 */
/* filts_ix_out_chan_elem_x_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_x = (filts_ix_out_chan_elem%%1) */
/* filts_ix_out_chan_elem_y_dim = 1 */
/* filts_ix_out_chan_elem_y_sz = 1 */
/* filts_ix_out_chan_elem_y_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_y = (filts_ix_out_chan_elem%%1) */
/* filts_ix_out_chan_elem_in_chan_dim = 192 */
/* filts_ix_out_chan_elem_in_chan_sz = 1 */
/* filts_ix_out_chan_elem_in_chan_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_in_chan = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_sz = 192 */
/* LOC_ID_1D_out_chan_tile_dim = 4 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%4) */
/* LOC_ID_1D_patch_tile_dim = 32 */
/* LOC_ID_1D_patch_tile_sz = 4 */
/* LOC_ID_1D_patch_tile_nomod = (LOC_ID_1D/4) */
/* LOC_ID_1D_patch_tile = (LOC_ID_1D/4) */
/* LOC_ID_1D_sz = 128 */
/* filts_xp_ix_out_chan_tile_dim = 4 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 4 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 32 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_x = ((filts_xp_ix/32)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 32 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_y = ((filts_xp_ix/32)%%1) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 32 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/32)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 6144 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/6144) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/6144) */
/* filts_xp_ix_sz = 6144 */
/* patch_smem_load_iter = 2 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_patch_blk_dim = 62 */
/* GRP_ID_1D_patch_blk_sz = 1 */
/* GRP_ID_1D_patch_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_patch_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 62 */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* patch_tile = (%(LOC_ID_1D_patch_tile)+%(GRP_ID_1D_patch_blk)*%(LOC_ID_1D_patch_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* patch_ix_0 = (%(patch_tile)*%(t_tile_sz)+0) */
/* patch_ix_0_x_dim = 28 */
/* patch_ix_0_x_sz = 1 */
/* patch_ix_0_x_nomod = %(patch_ix_0) */
/* patch_ix_0_x = (%(patch_ix_0)%%28) */
/* patch_ix_0_y_dim = 28 */
/* patch_ix_0_y_sz = 28 */
/* patch_ix_0_y_nomod = (%(patch_ix_0)/28) */
/* patch_ix_0_y = ((%(patch_ix_0)/28)%%28) */
/* patch_ix_0_img_dim = 20 */
/* patch_ix_0_img_sz = 784 */
/* patch_ix_0_img_nomod = (%(patch_ix_0)/784) */
/* patch_ix_0_img = (%(patch_ix_0)/784) */
/* patch_ix_0_sz = 15680 */
/* patch_ix_1 = (%(patch_tile)*%(t_tile_sz)+1) */
/* patch_ix_1_x_dim = 28 */
/* patch_ix_1_x_sz = 1 */
/* patch_ix_1_x_nomod = %(patch_ix_1) */
/* patch_ix_1_x = (%(patch_ix_1)%%28) */
/* patch_ix_1_y_dim = 28 */
/* patch_ix_1_y_sz = 28 */
/* patch_ix_1_y_nomod = (%(patch_ix_1)/28) */
/* patch_ix_1_y = ((%(patch_ix_1)/28)%%28) */
/* patch_ix_1_img_dim = 20 */
/* patch_ix_1_img_sz = 784 */
/* patch_ix_1_img_nomod = (%(patch_ix_1)/784) */
/* patch_ix_1_img = (%(patch_ix_1)/784) */
/* patch_ix_1_sz = 15680 */
/* patch_ix_2 = (%(patch_tile)*%(t_tile_sz)+2) */
/* patch_ix_2_x_dim = 28 */
/* patch_ix_2_x_sz = 1 */
/* patch_ix_2_x_nomod = %(patch_ix_2) */
/* patch_ix_2_x = (%(patch_ix_2)%%28) */
/* patch_ix_2_y_dim = 28 */
/* patch_ix_2_y_sz = 28 */
/* patch_ix_2_y_nomod = (%(patch_ix_2)/28) */
/* patch_ix_2_y = ((%(patch_ix_2)/28)%%28) */
/* patch_ix_2_img_dim = 20 */
/* patch_ix_2_img_sz = 784 */
/* patch_ix_2_img_nomod = (%(patch_ix_2)/784) */
/* patch_ix_2_img = (%(patch_ix_2)/784) */
/* patch_ix_2_sz = 15680 */
/* patch_ix_3 = (%(patch_tile)*%(t_tile_sz)+3) */
/* patch_ix_3_x_dim = 28 */
/* patch_ix_3_x_sz = 1 */
/* patch_ix_3_x_nomod = %(patch_ix_3) */
/* patch_ix_3_x = (%(patch_ix_3)%%28) */
/* patch_ix_3_y_dim = 28 */
/* patch_ix_3_y_sz = 28 */
/* patch_ix_3_y_nomod = (%(patch_ix_3)/28) */
/* patch_ix_3_y = ((%(patch_ix_3)/28)%%28) */
/* patch_ix_3_img_dim = 20 */
/* patch_ix_3_img_sz = 784 */
/* patch_ix_3_img_nomod = (%(patch_ix_3)/784) */
/* patch_ix_3_img = (%(patch_ix_3)/784) */
/* patch_ix_3_sz = 15680 */
/* patch_ix_4 = (%(patch_tile)*%(t_tile_sz)+4) */
/* patch_ix_4_x_dim = 28 */
/* patch_ix_4_x_sz = 1 */
/* patch_ix_4_x_nomod = %(patch_ix_4) */
/* patch_ix_4_x = (%(patch_ix_4)%%28) */
/* patch_ix_4_y_dim = 28 */
/* patch_ix_4_y_sz = 28 */
/* patch_ix_4_y_nomod = (%(patch_ix_4)/28) */
/* patch_ix_4_y = ((%(patch_ix_4)/28)%%28) */
/* patch_ix_4_img_dim = 20 */
/* patch_ix_4_img_sz = 784 */
/* patch_ix_4_img_nomod = (%(patch_ix_4)/784) */
/* patch_ix_4_img = (%(patch_ix_4)/784) */
/* patch_ix_4_sz = 15680 */
/* patch_ix_5 = (%(patch_tile)*%(t_tile_sz)+5) */
/* patch_ix_5_x_dim = 28 */
/* patch_ix_5_x_sz = 1 */
/* patch_ix_5_x_nomod = %(patch_ix_5) */
/* patch_ix_5_x = (%(patch_ix_5)%%28) */
/* patch_ix_5_y_dim = 28 */
/* patch_ix_5_y_sz = 28 */
/* patch_ix_5_y_nomod = (%(patch_ix_5)/28) */
/* patch_ix_5_y = ((%(patch_ix_5)/28)%%28) */
/* patch_ix_5_img_dim = 20 */
/* patch_ix_5_img_sz = 784 */
/* patch_ix_5_img_nomod = (%(patch_ix_5)/784) */
/* patch_ix_5_img = (%(patch_ix_5)/784) */
/* patch_ix_5_sz = 15680 */
/* patch_ix_6 = (%(patch_tile)*%(t_tile_sz)+6) */
/* patch_ix_6_x_dim = 28 */
/* patch_ix_6_x_sz = 1 */
/* patch_ix_6_x_nomod = %(patch_ix_6) */
/* patch_ix_6_x = (%(patch_ix_6)%%28) */
/* patch_ix_6_y_dim = 28 */
/* patch_ix_6_y_sz = 28 */
/* patch_ix_6_y_nomod = (%(patch_ix_6)/28) */
/* patch_ix_6_y = ((%(patch_ix_6)/28)%%28) */
/* patch_ix_6_img_dim = 20 */
/* patch_ix_6_img_sz = 784 */
/* patch_ix_6_img_nomod = (%(patch_ix_6)/784) */
/* patch_ix_6_img = (%(patch_ix_6)/784) */
/* patch_ix_6_sz = 15680 */
/* patch_ix_7 = (%(patch_tile)*%(t_tile_sz)+7) */
/* patch_ix_7_x_dim = 28 */
/* patch_ix_7_x_sz = 1 */
/* patch_ix_7_x_nomod = %(patch_ix_7) */
/* patch_ix_7_x = (%(patch_ix_7)%%28) */
/* patch_ix_7_y_dim = 28 */
/* patch_ix_7_y_sz = 28 */
/* patch_ix_7_y_nomod = (%(patch_ix_7)/28) */
/* patch_ix_7_y = ((%(patch_ix_7)/28)%%28) */
/* patch_ix_7_img_dim = 20 */
/* patch_ix_7_img_sz = 784 */
/* patch_ix_7_img_nomod = (%(patch_ix_7)/784) */
/* patch_ix_7_img = (%(patch_ix_7)/784) */
/* patch_ix_7_sz = 15680 */
/* get_in = float v = 0;
int const smem_in_ix_y = %(t_smem_patch_ix_y)*%(stride)+%(filts_ix_out_chan_elem_y) - %(in_pad);
int const smem_in_ix_x = %(t_smem_patch_ix_x)*%(stride)+%(filts_ix_out_chan_elem_x) - %(in_pad);
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 &&
%(t_smem_patch_ix_img) < %(in_ix_img_dim) &&
smem_in_ix_x < %(in_ix_x_dim) && smem_in_ix_y < %(in_ix_y_dim) ) {
v = in[%(t_smem_patch_ix_img)*%(in_ix_img_sz) +
%(filts_ix_out_chan_elem_in_chan)*%(in_ix_chan_sz) +
smem_in_ix_y*%(in_ix_y_sz) +
smem_in_ix_x*%(in_ix_x_sz)];
} */
/* t_tile_fmas = // begin t_tile_fmas
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
// end t_tile_fmas */
/* t_tile_loads = // begin t_tile_loads
filts_strip[0] = filts_smem[%(LOC_ID_1D_out_chan_tile)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem[%(LOC_ID_1D_out_chan_tile)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem[%(LOC_ID_1D_out_chan_tile)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem[%(LOC_ID_1D_out_chan_tile)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem[%(LOC_ID_1D_out_chan_tile)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem[%(LOC_ID_1D_out_chan_tile)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem[%(LOC_ID_1D_out_chan_tile)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem[%(LOC_ID_1D_out_chan_tile)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+0];
in_strip[1] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+1];
in_strip[2] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+2];
in_strip[3] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+3];
in_strip[4] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+4];
in_strip[5] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+5];
in_strip[6] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+6];
in_strip[7] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+7];
// end t_tile_loads */
/* t_tile_dummy_loads = // begin t_tile_dummy_loads
filts_strip[0] = filts_smem[(LOC_ID_1D %% 32) + 0];
filts_strip[1] = filts_smem[(LOC_ID_1D %% 32) + 1];
filts_strip[2] = filts_smem[(LOC_ID_1D %% 32) + 2];
filts_strip[3] = filts_smem[(LOC_ID_1D %% 32) + 3];
filts_strip[4] = filts_smem[(LOC_ID_1D %% 32) + 4];
filts_strip[5] = filts_smem[(LOC_ID_1D %% 32) + 5];
filts_strip[6] = filts_smem[(LOC_ID_1D %% 32) + 6];
filts_strip[7] = filts_smem[(LOC_ID_1D %% 32) + 7];
in_strip[0] = in_smem[(LOC_ID_1D %% 32) + 0];
in_strip[1] = in_smem[(LOC_ID_1D %% 32) + 1];
in_strip[2] = in_smem[(LOC_ID_1D %% 32) + 2];
in_strip[3] = in_smem[(LOC_ID_1D %% 32) + 3];
in_strip[4] = in_smem[(LOC_ID_1D %% 32) + 4];
in_strip[5] = in_smem[(LOC_ID_1D %% 32) + 5];
in_strip[6] = in_smem[(LOC_ID_1D %% 32) + 6];
in_strip[7] = in_smem[(LOC_ID_1D %% 32) + 7];
// end t_tile_dummy_loads */
/* t_tile_stores = // begin t_tile_stores
int32_t tpix[%(t_tile_sz)];
int32_t tcix[%(t_tile_sz)];
tpix[0] = %(patch_ix_0_img)*%(out_ix_img_sz) +
( %(patch_ix_0) %% %(patch_ix_0_img_sz) ); // cache out patch ixs
tpix[1] = %(patch_ix_1_img)*%(out_ix_img_sz) +
( %(patch_ix_1) %% %(patch_ix_1_img_sz) ); // cache out patch ixs
tpix[2] = %(patch_ix_2_img)*%(out_ix_img_sz) +
( %(patch_ix_2) %% %(patch_ix_2_img_sz) ); // cache out patch ixs
tpix[3] = %(patch_ix_3_img)*%(out_ix_img_sz) +
( %(patch_ix_3) %% %(patch_ix_3_img_sz) ); // cache out patch ixs
tpix[4] = %(patch_ix_4_img)*%(out_ix_img_sz) +
( %(patch_ix_4) %% %(patch_ix_4_img_sz) ); // cache out patch ixs
tpix[5] = %(patch_ix_5_img)*%(out_ix_img_sz) +
( %(patch_ix_5) %% %(patch_ix_5_img_sz) ); // cache out patch ixs
tpix[6] = %(patch_ix_6_img)*%(out_ix_img_sz) +
( %(patch_ix_6) %% %(patch_ix_6_img_sz) ); // cache out patch ixs
tpix[7] = %(patch_ix_7_img)*%(out_ix_img_sz) +
( %(patch_ix_7) %% %(patch_ix_7_img_sz) ); // cache out patch ixs
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs
if( %(patch_ix_0) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( %(patch_ix_1) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( %(patch_ix_2) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( %(patch_ix_3) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( %(patch_ix_4) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( %(patch_ix_5) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( %(patch_ix_6) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( %(patch_ix_7) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
/* t_tile_dummy_stores = // begin t_tile_dummy_stores
out[0] = 0.0f
+ max(0.0f,out_tile[0] + filts_strip[0])
+ max(0.0f,out_tile[1] + filts_strip[1])
+ max(0.0f,out_tile[2] + filts_strip[2])
+ max(0.0f,out_tile[3] + filts_strip[3])
+ max(0.0f,out_tile[4] + filts_strip[4])
+ max(0.0f,out_tile[5] + filts_strip[5])
+ max(0.0f,out_tile[6] + filts_strip[6])
+ max(0.0f,out_tile[7] + filts_strip[7])
+ max(0.0f,out_tile[8] + filts_strip[0])
+ max(0.0f,out_tile[9] + filts_strip[1])
+ max(0.0f,out_tile[10] + filts_strip[2])
+ max(0.0f,out_tile[11] + filts_strip[3])
+ max(0.0f,out_tile[12] + filts_strip[4])
+ max(0.0f,out_tile[13] + filts_strip[5])
+ max(0.0f,out_tile[14] + filts_strip[6])
+ max(0.0f,out_tile[15] + filts_strip[7])
+ max(0.0f,out_tile[16] + filts_strip[0])
+ max(0.0f,out_tile[17] + filts_strip[1])
+ max(0.0f,out_tile[18] + filts_strip[2])
+ max(0.0f,out_tile[19] + filts_strip[3])
+ max(0.0f,out_tile[20] + filts_strip[4])
+ max(0.0f,out_tile[21] + filts_strip[5])
+ max(0.0f,out_tile[22] + filts_strip[6])
+ max(0.0f,out_tile[23] + filts_strip[7])
+ max(0.0f,out_tile[24] + filts_strip[0])
+ max(0.0f,out_tile[25] + filts_strip[1])
+ max(0.0f,out_tile[26] + filts_strip[2])
+ max(0.0f,out_tile[27] + filts_strip[3])
+ max(0.0f,out_tile[28] + filts_strip[4])
+ max(0.0f,out_tile[29] + filts_strip[5])
+ max(0.0f,out_tile[30] + filts_strip[6])
+ max(0.0f,out_tile[31] + filts_strip[7])
+ max(0.0f,out_tile[32] + filts_strip[0])
+ max(0.0f,out_tile[33] + filts_strip[1])
+ max(0.0f,out_tile[34] + filts_strip[2])
+ max(0.0f,out_tile[35] + filts_strip[3])
+ max(0.0f,out_tile[36] + filts_strip[4])
+ max(0.0f,out_tile[37] + filts_strip[5])
+ max(0.0f,out_tile[38] + filts_strip[6])
+ max(0.0f,out_tile[39] + filts_strip[7])
+ max(0.0f,out_tile[40] + filts_strip[0])
+ max(0.0f,out_tile[41] + filts_strip[1])
+ max(0.0f,out_tile[42] + filts_strip[2])
+ max(0.0f,out_tile[43] + filts_strip[3])
+ max(0.0f,out_tile[44] + filts_strip[4])
+ max(0.0f,out_tile[45] + filts_strip[5])
+ max(0.0f,out_tile[46] + filts_strip[6])
+ max(0.0f,out_tile[47] + filts_strip[7])
+ max(0.0f,out_tile[48] + filts_strip[0])
+ max(0.0f,out_tile[49] + filts_strip[1])
+ max(0.0f,out_tile[50] + filts_strip[2])
+ max(0.0f,out_tile[51] + filts_strip[3])
+ max(0.0f,out_tile[52] + filts_strip[4])
+ max(0.0f,out_tile[53] + filts_strip[5])
+ max(0.0f,out_tile[54] + filts_strip[6])
+ max(0.0f,out_tile[55] + filts_strip[7])
+ max(0.0f,out_tile[56] + filts_strip[0])
+ max(0.0f,out_tile[57] + filts_strip[1])
+ max(0.0f,out_tile[58] + filts_strip[2])
+ max(0.0f,out_tile[59] + filts_strip[3])
+ max(0.0f,out_tile[60] + filts_strip[4])
+ max(0.0f,out_tile[61] + filts_strip[5])
+ max(0.0f,out_tile[62] + filts_strip[6])
+ max(0.0f,out_tile[63] + filts_strip[7])
;
// end t_tile_dummy_stores */
// Copies each of the 6144 filter values from 'in' to a permuted position in 'out'.
// Each work-item handles the single element selected by its 1D global id.
// The CUCL annotations on the parameters give the source and destination dimension orders.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_32__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
					  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D; // flat index into 'in' for this work-item
  if( filts_ix < 6144 ) { // work-items at or past the 6144 input elements do nothing
    int32_t const fioc = filts_ix / 192; // out_chan part of the flat input index
    // pieces of the output index, one per dimension of 'out'
    int32_t const oc_blk = fioc / 32;
    int32_t const oc_reg = fioc % 8;
    int32_t const oc_tile = ( fioc / 8 ) % 4;
    int32_t const in_chan = filts_ix % 192;
    int32_t const ky = filts_ix % 1; // y extent is 1, so this is always 0
    int32_t const kx = filts_ix % 1; // x extent is 1, so this is always 0
    // each piece is scaled by the stride of its dimension in 'out'
    int32_t const filts_xp_ix = oc_blk*6144 + oc_reg*4 + oc_tile + in_chan*32 + ky*32 + kx*32;
    out[filts_xp_ix] = in[filts_ix];
  }
}
// -- template substitution table used: --
/* out_chans = 32 */
/* in_chans = 192 */
/* kysz = 1 */
/* kxsz = 1 */
/* rtc_func_name = xpose_filts__out_chans_32__in_chans_192__kysz_1__kxsz_1 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 1 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%1) */
/* filts_ix_y_dim = 1 */
/* filts_ix_y_sz = 1 */
/* filts_ix_y_nomod = filts_ix */
/* filts_ix_y = (filts_ix%%1) */
/* filts_ix_in_chan_dim = 192 */
/* filts_ix_in_chan_sz = 1 */
/* filts_ix_in_chan_nomod = filts_ix */
/* filts_ix_in_chan = (filts_ix%%192) */
/* filts_ix_out_chan_dim = 32 */
/* filts_ix_out_chan_sz = 192 */
/* filts_ix_out_chan_nomod = (filts_ix/192) */
/* filts_ix_out_chan = (filts_ix/192) */
/* filts_ix_sz = 6144 */
/* filts_xp_ix_out_chan_tile_dim = 4 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 4 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 32 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_x = ((filts_xp_ix/32)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 32 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_y = ((filts_xp_ix/32)%%1) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 32 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/32)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 6144 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/6144) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/6144) */
/* filts_xp_ix_sz = 6144 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 4 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%4) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 32 */
/* fioc_out_chan_blk_nomod = (fioc/32) */
/* fioc_out_chan_blk = (fioc/32) */
/* fioc_sz = 32 */
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
//int32_t const blk_in_ix_sz = 16*8;
LOCSHAR_MEM float all_smem[1536]; // note: max(filts+in,out) == max(512+1024,1024)
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 512;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*12288; // index of first out chan
int32_t blk_in_ix_base = GRP_ID_1D*24576 + LOC_ID_1D;// index of first input pel to load for this thread
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%8);
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/8);
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D;
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
// iteratate over filter elements
for( int32_t blk_iter = 0; blk_iter != 24; ++blk_iter ) {
BARRIER_SYNC;
// begin smem_loads
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)];
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ];
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ];
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ];
in_smem[(LOC_ID_1D + 128 * 4)] = in[ blk_in_ix_base + (128*4) ];
in_smem[(LOC_ID_1D + 128 * 5)] = in[ blk_in_ix_base + (128*5) ];
in_smem[(LOC_ID_1D + 128 * 6)] = in[ blk_in_ix_base + (128*6) ];
in_smem[(LOC_ID_1D + 128 * 7)] = in[ blk_in_ix_base + (128*7) ];
// end smem_loads;
BARRIER_SYNC;
filts_off += 64*8;
blk_in_ix_base += 1024;
// begin inner_loop_body
filts_strip[0] = filts_smem_off[0*64+0*8];
filts_strip[1] = filts_smem_off[0*64+1*8];
filts_strip[2] = filts_smem_off[0*64+2*8];
filts_strip[3] = filts_smem_off[0*64+3*8];
filts_strip[4] = filts_smem_off[0*64+4*8];
filts_strip[5] = filts_smem_off[0*64+5*8];
filts_strip[6] = filts_smem_off[0*64+6*8];
filts_strip[7] = filts_smem_off[0*64+7*8];
in_strip[0] = in_smem_off[(0*8*16+0)];
in_strip[1] = in_smem_off[(0*8*16+1)];
in_strip[2] = in_smem_off[(0*8*16+2)];
in_strip[3] = in_smem_off[(0*8*16+3)];
in_strip[4] = in_smem_off[(0*8*16+4)];
in_strip[5] = in_smem_off[(0*8*16+5)];
in_strip[6] = in_smem_off[(0*8*16+6)];
in_strip[7] = in_smem_off[(0*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*64+0*8];
filts_strip[1] = filts_smem_off[1*64+1*8];
filts_strip[2] = filts_smem_off[1*64+2*8];
filts_strip[3] = filts_smem_off[1*64+3*8];
filts_strip[4] = filts_smem_off[1*64+4*8];
filts_strip[5] = filts_smem_off[1*64+5*8];
filts_strip[6] = filts_smem_off[1*64+6*8];
filts_strip[7] = filts_smem_off[1*64+7*8];
in_strip[0] = in_smem_off[(1*8*16+0)];
in_strip[1] = in_smem_off[(1*8*16+1)];
in_strip[2] = in_smem_off[(1*8*16+2)];
in_strip[3] = in_smem_off[(1*8*16+3)];
in_strip[4] = in_smem_off[(1*8*16+4)];
in_strip[5] = in_smem_off[(1*8*16+5)];
in_strip[6] = in_smem_off[(1*8*16+6)];
in_strip[7] = in_smem_off[(1*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*64+0*8];
filts_strip[1] = filts_smem_off[2*64+1*8];
filts_strip[2] = filts_smem_off[2*64+2*8];
filts_strip[3] = filts_smem_off[2*64+3*8];
filts_strip[4] = filts_smem_off[2*64+4*8];
filts_strip[5] = filts_smem_off[2*64+5*8];
filts_strip[6] = filts_smem_off[2*64+6*8];
filts_strip[7] = filts_smem_off[2*64+7*8];
in_strip[0] = in_smem_off[(2*8*16+0)];
in_strip[1] = in_smem_off[(2*8*16+1)];
in_strip[2] = in_smem_off[(2*8*16+2)];
in_strip[3] = in_smem_off[(2*8*16+3)];
in_strip[4] = in_smem_off[(2*8*16+4)];
in_strip[5] = in_smem_off[(2*8*16+5)];
in_strip[6] = in_smem_off[(2*8*16+6)];
in_strip[7] = in_smem_off[(2*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*64+0*8];
filts_strip[1] = filts_smem_off[3*64+1*8];
filts_strip[2] = filts_smem_off[3*64+2*8];
filts_strip[3] = filts_smem_off[3*64+3*8];
filts_strip[4] = filts_smem_off[3*64+4*8];
filts_strip[5] = filts_smem_off[3*64+5*8];
filts_strip[6] = filts_smem_off[3*64+6*8];
filts_strip[7] = filts_smem_off[3*64+7*8];
in_strip[0] = in_smem_off[(3*8*16+0)];
in_strip[1] = in_smem_off[(3*8*16+1)];
in_strip[2] = in_smem_off[(3*8*16+2)];
in_strip[3] = in_smem_off[(3*8*16+3)];
in_strip[4] = in_smem_off[(3*8*16+4)];
in_strip[5] = in_smem_off[(3*8*16+5)];
in_strip[6] = in_smem_off[(3*8*16+6)];
in_strip[7] = in_smem_off[(3*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*64+0*8];
filts_strip[1] = filts_smem_off[4*64+1*8];
filts_strip[2] = filts_smem_off[4*64+2*8];
filts_strip[3] = filts_smem_off[4*64+3*8];
filts_strip[4] = filts_smem_off[4*64+4*8];
filts_strip[5] = filts_smem_off[4*64+5*8];
filts_strip[6] = filts_smem_off[4*64+6*8];
filts_strip[7] = filts_smem_off[4*64+7*8];
in_strip[0] = in_smem_off[(4*8*16+0)];
in_strip[1] = in_smem_off[(4*8*16+1)];
in_strip[2] = in_smem_off[(4*8*16+2)];
in_strip[3] = in_smem_off[(4*8*16+3)];
in_strip[4] = in_smem_off[(4*8*16+4)];
in_strip[5] = in_smem_off[(4*8*16+5)];
in_strip[6] = in_smem_off[(4*8*16+6)];
in_strip[7] = in_smem_off[(4*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*64+0*8];
filts_strip[1] = filts_smem_off[5*64+1*8];
filts_strip[2] = filts_smem_off[5*64+2*8];
filts_strip[3] = filts_smem_off[5*64+3*8];
filts_strip[4] = filts_smem_off[5*64+4*8];
filts_strip[5] = filts_smem_off[5*64+5*8];
filts_strip[6] = filts_smem_off[5*64+6*8];
filts_strip[7] = filts_smem_off[5*64+7*8];
in_strip[0] = in_smem_off[(5*8*16+0)];
in_strip[1] = in_smem_off[(5*8*16+1)];
in_strip[2] = in_smem_off[(5*8*16+2)];
in_strip[3] = in_smem_off[(5*8*16+3)];
in_strip[4] = in_smem_off[(5*8*16+4)];
in_strip[5] = in_smem_off[(5*8*16+5)];
in_strip[6] = in_smem_off[(5*8*16+6)];
in_strip[7] = in_smem_off[(5*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*64+0*8];
filts_strip[1] = filts_smem_off[6*64+1*8];
filts_strip[2] = filts_smem_off[6*64+2*8];
filts_strip[3] = filts_smem_off[6*64+3*8];
filts_strip[4] = filts_smem_off[6*64+4*8];
filts_strip[5] = filts_smem_off[6*64+5*8];
filts_strip[6] = filts_smem_off[6*64+6*8];
filts_strip[7] = filts_smem_off[6*64+7*8];
in_strip[0] = in_smem_off[(6*8*16+0)];
in_strip[1] = in_smem_off[(6*8*16+1)];
in_strip[2] = in_smem_off[(6*8*16+2)];
in_strip[3] = in_smem_off[(6*8*16+3)];
in_strip[4] = in_smem_off[(6*8*16+4)];
in_strip[5] = in_smem_off[(6*8*16+5)];
in_strip[6] = in_smem_off[(6*8*16+6)];
in_strip[7] = in_smem_off[(6*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*64+0*8];
filts_strip[1] = filts_smem_off[7*64+1*8];
filts_strip[2] = filts_smem_off[7*64+2*8];
filts_strip[3] = filts_smem_off[7*64+3*8];
filts_strip[4] = filts_smem_off[7*64+4*8];
filts_strip[5] = filts_smem_off[7*64+5*8];
filts_strip[6] = filts_smem_off[7*64+6*8];
filts_strip[7] = filts_smem_off[7*64+7*8];
in_strip[0] = in_smem_off[(7*8*16+0)];
in_strip[1] = in_smem_off[(7*8*16+1)];
in_strip[2] = in_smem_off[(7*8*16+2)];
in_strip[3] = in_smem_off[(7*8*16+3)];
in_strip[4] = in_smem_off[(7*8*16+4)];
in_strip[5] = in_smem_off[(7*8*16+5)];
in_strip[6] = in_smem_off[(7*8*16+6)];
in_strip[7] = in_smem_off[(7*8*16+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
;
}
// load per-block biases into smem
if( flags == 2 ) { return; }
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
if( t_smem_bias_ix < 64 ) {
int32_t const ocix_base = (GRP_ID_1D%1)*64;
int32_t const load_reg = t_smem_bias_ix / 8;
int32_t const load_tile = t_smem_bias_ix % 8;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 64 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// load biases into filts_strip
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*8];
filts_strip[1] = filts_smem_off[1*8];
filts_strip[2] = filts_smem_off[2*8];
filts_strip[3] = filts_smem_off[3*8];
filts_strip[4] = filts_smem_off[4*8];
filts_strip[5] = filts_smem_off[5*8];
filts_strip[6] = filts_smem_off[6*8];
filts_strip[7] = filts_smem_off[7*8];
// end t_tile_bias_loads;
if( flags == 1 ) {
GASQ float * const out_off = out + LOC_ID_1D;
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]);
;
return;
}
// add bias to each elem of out_tile[] and store the results to out[]
// begin t_tile_stores
int32_t tpix[8];
int32_t tcix[8];
tpix[0] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)%784)*1 ; // cache out patch ixs
tpix[1] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)%784)*1 ; // cache out patch ixs
tpix[2] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)%784)*1 ; // cache out patch ixs
tpix[3] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)%784)*1 ; // cache out patch ixs
tpix[4] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)%784)*1 ; // cache out patch ixs
tpix[5] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)%784)*1 ; // cache out patch ixs
tpix[6] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)%784)*1 ; // cache out patch ixs
tpix[7] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)%784)*1 ; // cache out patch ixs
tcix[0] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+0)*784; // cache out chan ixs
tcix[1] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+1)*784; // cache out chan ixs
tcix[2] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+2)*784; // cache out chan ixs
tcix[3] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+3)*784; // cache out chan ixs
tcix[4] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+4)*784; // cache out chan ixs
tcix[5] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+5)*784; // cache out chan ixs
tcix[6] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+6)*784; // cache out chan ixs
tcix[7] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+7)*784; // cache out chan ixs
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (64*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (64*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (64*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (64*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (64*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (64*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (64*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (64*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 1 */
/* out_chans = 64 */
/* write_xposed = 0 */
/* in_chans = 192 */
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_192 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 64 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%64) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 50176 */
/* out_ix_img_nomod = (out_ix/50176) */
/* out_ix_img = (out_ix/50176) */
/* out_ix_sz = 1003520 */
/* tpb = 128 */
/* in_chan_tile = 8 */
/* LOC_ID_1D_out_chan_tile_dim = 8 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%8) */
/* LOC_ID_1D_pels_tile_dim = 16 */
/* LOC_ID_1D_pels_tile_sz = 8 */
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/8) */
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/8) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_pels_blk_dim = 123 */
/* GRP_ID_1D_pels_blk_sz = 1 */
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_pels_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 123 */
/* in_ix_blk_pel_dim = 128 */
/* in_ix_blk_pel_sz = 1 */
/* in_ix_blk_pel_nomod = in_ix */
/* in_ix_blk_pel = (in_ix%%128) */
/* in_ix_blk_iter_chan_dim = 8 */
/* in_ix_blk_iter_chan_sz = 128 */
/* in_ix_blk_iter_chan_nomod = (in_ix/128) */
/* in_ix_blk_iter_chan = ((in_ix/128)%%8) */
/* in_ix_blk_iter_dim = 24 */
/* in_ix_blk_iter_sz = 1024 */
/* in_ix_blk_iter_nomod = (in_ix/1024) */
/* in_ix_blk_iter = ((in_ix/1024)%%24) */
/* in_ix_blk_dim = 123 */
/* in_ix_blk_sz = 24576 */
/* in_ix_blk_nomod = (in_ix/24576) */
/* in_ix_blk = (in_ix/24576) */
/* in_ix_sz = 3022848 */
/* blk_filt_ix_sz = 64 */
/* filts_smem_sz = 512 */
/* in_smem_sz = 1024 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1536 */
/* filts_xp_ix_out_chan_tile_dim = 8 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 8 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 64 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 12288 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12288) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12288) */
/* filts_xp_ix_sz = 12288 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* smem_loads = // begin smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)];
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ];
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ];
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ];
in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ];
in_smem[(LOC_ID_1D + %(tpb) * 6)] = in[ blk_in_ix_base + (%(tpb)*6) ];
in_smem[(LOC_ID_1D + %(tpb) * 7)] = in[ blk_in_ix_base + (%(tpb)*7) ];
// end smem_loads */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* t_smem_ld_pel_pel_dim = 128 */
/* t_smem_ld_pel_pel_sz = 1 */
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%128) */
/* t_smem_ld_pel_chan_dim = 8 */
/* t_smem_ld_pel_chan_sz = 128 */
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/128) */
/* t_smem_ld_pel_chan = (t_smem_ld_pel/128) */
/* t_smem_ld_pel_sz = 1024 */
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */
/* out_pel_0_pel_dim = 784 */
/* out_pel_0_pel_sz = 1 */
/* out_pel_0_pel_nomod = %(out_pel_0) */
/* out_pel_0_pel = (%(out_pel_0)%%784) */
/* out_pel_0_img_dim = 20 */
/* out_pel_0_img_sz = 784 */
/* out_pel_0_img_nomod = (%(out_pel_0)/784) */
/* out_pel_0_img = (%(out_pel_0)/784) */
/* out_pel_0_sz = 15680 */
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */
/* out_pel_1_pel_dim = 784 */
/* out_pel_1_pel_sz = 1 */
/* out_pel_1_pel_nomod = %(out_pel_1) */
/* out_pel_1_pel = (%(out_pel_1)%%784) */
/* out_pel_1_img_dim = 20 */
/* out_pel_1_img_sz = 784 */
/* out_pel_1_img_nomod = (%(out_pel_1)/784) */
/* out_pel_1_img = (%(out_pel_1)/784) */
/* out_pel_1_sz = 15680 */
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */
/* out_pel_2_pel_dim = 784 */
/* out_pel_2_pel_sz = 1 */
/* out_pel_2_pel_nomod = %(out_pel_2) */
/* out_pel_2_pel = (%(out_pel_2)%%784) */
/* out_pel_2_img_dim = 20 */
/* out_pel_2_img_sz = 784 */
/* out_pel_2_img_nomod = (%(out_pel_2)/784) */
/* out_pel_2_img = (%(out_pel_2)/784) */
/* out_pel_2_sz = 15680 */
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */
/* out_pel_3_pel_dim = 784 */
/* out_pel_3_pel_sz = 1 */
/* out_pel_3_pel_nomod = %(out_pel_3) */
/* out_pel_3_pel = (%(out_pel_3)%%784) */
/* out_pel_3_img_dim = 20 */
/* out_pel_3_img_sz = 784 */
/* out_pel_3_img_nomod = (%(out_pel_3)/784) */
/* out_pel_3_img = (%(out_pel_3)/784) */
/* out_pel_3_sz = 15680 */
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */
/* out_pel_4_pel_dim = 784 */
/* out_pel_4_pel_sz = 1 */
/* out_pel_4_pel_nomod = %(out_pel_4) */
/* out_pel_4_pel = (%(out_pel_4)%%784) */
/* out_pel_4_img_dim = 20 */
/* out_pel_4_img_sz = 784 */
/* out_pel_4_img_nomod = (%(out_pel_4)/784) */
/* out_pel_4_img = (%(out_pel_4)/784) */
/* out_pel_4_sz = 15680 */
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */
/* out_pel_5_pel_dim = 784 */
/* out_pel_5_pel_sz = 1 */
/* out_pel_5_pel_nomod = %(out_pel_5) */
/* out_pel_5_pel = (%(out_pel_5)%%784) */
/* out_pel_5_img_dim = 20 */
/* out_pel_5_img_sz = 784 */
/* out_pel_5_img_nomod = (%(out_pel_5)/784) */
/* out_pel_5_img = (%(out_pel_5)/784) */
/* out_pel_5_sz = 15680 */
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */
/* out_pel_6_pel_dim = 784 */
/* out_pel_6_pel_sz = 1 */
/* out_pel_6_pel_nomod = %(out_pel_6) */
/* out_pel_6_pel = (%(out_pel_6)%%784) */
/* out_pel_6_img_dim = 20 */
/* out_pel_6_img_sz = 784 */
/* out_pel_6_img_nomod = (%(out_pel_6)/784) */
/* out_pel_6_img = (%(out_pel_6)/784) */
/* out_pel_6_sz = 15680 */
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */
/* out_pel_7_pel_dim = 784 */
/* out_pel_7_pel_sz = 1 */
/* out_pel_7_pel_nomod = %(out_pel_7) */
/* out_pel_7_pel = (%(out_pel_7)%%784) */
/* out_pel_7_img_dim = 20 */
/* out_pel_7_img_sz = 784 */
/* out_pel_7_img_nomod = (%(out_pel_7)/784) */
/* out_pel_7_img = (%(out_pel_7)/784) */
/* out_pel_7_sz = 15680 */
/* t_tile_stores = // begin t_tile_stores
int32_t tpix[%(t_tile_sz)];
int32_t tcix[%(t_tile_sz)];
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]);
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* inner_loop_body = // begin inner_loop_body
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
*/
// Rearranges `in` (indexed img:chan:y:x, 20x192x28x28) into `out`, which is indexed
// blk:blk_iter:blk_iter_chan:blk_pel (123x24x8x128, 3022848 elements total).
// Each work-item writes exactly one element of `out`. Output slots whose channel or
// image index falls outside the input are written as 0.0f.
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_16__bix_pels_blk_sz_123( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  // Guard against a global work size that exceeds the output size (123*24576); the
  // other per-element kernels in this file bounds-check their index the same way.
  if( out_ix >= 3022848 ) { return; }

  // decompose the linear output index into its four dims
  int32_t const blk = out_ix/24576;
  int32_t const blk_iter = (out_ix/1024)%24;
  int32_t const blk_iter_chan = (out_ix/128)%8;
  int32_t const blk_pel = out_ix%128;

  // source coordinates: channel, and pel (img:y:x merged into one linear index)
  int32_t const chan_ix = blk_iter*8 + blk_iter_chan;
  int32_t const pel_ix = blk*128 + blk_pel;
  int32_t const img = pel_ix/784;

  float v = 0.0f; // value used for padding slots
  if( ( chan_ix < 192 ) && ( img < 20 ) ) {
    v = in[ img*150528 +
            chan_ix*784 +
            ((pel_ix/28)%28)*28 +
            (pel_ix%28)*1 ];
  }
  out[out_ix] = v;
}
/*
in_pels = num_img * in.sz.dims_prod()
num_in_blks = u32_ceil_div( in_pels, block_chan_pels )
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12
pad_chan = block_iters * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512
block_pels = 12*512 = 6144 // note: 6144 floats = 24576 bytes, prob. too big for SM to fully cache, but one iter (512 floats = 2K bytes, per-iter cache) is fine.
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?)
*/
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chan_tile = 8 */
/* pad_in_chans = 192 */
/* in_chans = 192 */
/* ysz = 28 */
/* xsz = 28 */
/* tix_pels_tile_sz = 16 */
/* bix_pels_blk_sz = 123 */
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_16__bix_pels_blk_sz_123 */
/* out_ix_blk_pel_dim = 128 */
/* out_ix_blk_pel_sz = 1 */
/* out_ix_blk_pel_nomod = out_ix */
/* out_ix_blk_pel = (out_ix%%128) */
/* out_ix_blk_iter_chan_dim = 8 */
/* out_ix_blk_iter_chan_sz = 128 */
/* out_ix_blk_iter_chan_nomod = (out_ix/128) */
/* out_ix_blk_iter_chan = ((out_ix/128)%%8) */
/* out_ix_blk_iter_dim = 24 */
/* out_ix_blk_iter_sz = 1024 */
/* out_ix_blk_iter_nomod = (out_ix/1024) */
/* out_ix_blk_iter = ((out_ix/1024)%%24) */
/* out_ix_blk_dim = 123 */
/* out_ix_blk_sz = 24576 */
/* out_ix_blk_nomod = (out_ix/24576) */
/* out_ix_blk = (out_ix/24576) */
/* out_ix_sz = 3022848 */
/* pel_ix_x_dim = 28 */
/* pel_ix_x_sz = 1 */
/* pel_ix_x_nomod = pel_ix */
/* pel_ix_x = (pel_ix%%28) */
/* pel_ix_y_dim = 28 */
/* pel_ix_y_sz = 28 */
/* pel_ix_y_nomod = (pel_ix/28) */
/* pel_ix_y = ((pel_ix/28)%%28) */
/* pel_ix_img_dim = 20 */
/* pel_ix_img_sz = 784 */
/* pel_ix_img_nomod = (pel_ix/784) */
/* pel_ix_img = (pel_ix/784) */
/* pel_ix_sz = 15680 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 192 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%192) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 150528 */
/* in_ix_img_nomod = (in_ix/150528) */
/* in_ix_img = (in_ix/150528) */
/* in_ix_sz = 3010560 */
// Reorders filter weights: reads `in` (out_chan:in_chan:y:x, 64x192x1x1) and writes
// `out` (out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile, 1x192x1x1x8x8).
// Each work-item moves a single weight; work-items at or beyond 12288 do nothing.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_64__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix < 12288 ) {
    int32_t const fioc = (filts_ix/192);   // source out_chan
    int32_t const in_chan = (filts_ix%192);

    // split the source out_chan into its blk:tile:reg parts
    int32_t const oc_blk = (fioc/64);
    int32_t const oc_tile = ((fioc/8)%8);
    int32_t const oc_reg = (fioc%8);

    // The y and x extents are both 1, so their contributions ((filts_ix%1)*64 each)
    // are always zero and are left out of the destination index.
    int32_t const filts_xp_ix = oc_blk*12288 + in_chan*64 + oc_reg*8 + oc_tile*1;

    out[filts_xp_ix] = in[filts_ix];
  }
}
// -- template substitution table used: --
/* out_chans = 64 */
/* in_chans = 192 */
/* kysz = 1 */
/* kxsz = 1 */
/* rtc_func_name = xpose_filts__out_chans_64__in_chans_192__kysz_1__kxsz_1 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 1 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%1) */
/* filts_ix_y_dim = 1 */
/* filts_ix_y_sz = 1 */
/* filts_ix_y_nomod = filts_ix */
/* filts_ix_y = (filts_ix%%1) */
/* filts_ix_in_chan_dim = 192 */
/* filts_ix_in_chan_sz = 1 */
/* filts_ix_in_chan_nomod = filts_ix */
/* filts_ix_in_chan = (filts_ix%%192) */
/* filts_ix_out_chan_dim = 64 */
/* filts_ix_out_chan_sz = 192 */
/* filts_ix_out_chan_nomod = (filts_ix/192) */
/* filts_ix_out_chan = (filts_ix/192) */
/* filts_ix_sz = 12288 */
/* filts_xp_ix_out_chan_tile_dim = 8 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 8 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 64 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_x = ((filts_xp_ix/64)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 64 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_y = ((filts_xp_ix/64)%%1) */
/* filts_xp_ix_in_chan_dim = 192 */
/* filts_xp_ix_in_chan_sz = 64 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%192) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 12288 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12288) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12288) */
/* filts_xp_ix_sz = 12288 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 8 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%8) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 64 */
/* fioc_out_chan_blk_nomod = (fioc/64) */
/* fioc_out_chan_blk = (fioc/64) */
/* fioc_sz = 64 */
// each thread: copies one element of in (img:chan:y:x, 20x64x28x28) to the same
// img/y/x position of out (img:chan:y:x, 20x256x28x28), at channel (chan + 0).
// threads with an index at or beyond the input size (1003520) do nothing.
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_64__ysz_28__xsz_28__out_chans_256__ocix_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const in_ix = GLOB_ID_1D;
  if( in_ix < 1003520 ) {
    // decompose the linear input index
    int32_t const img = in_ix/50176;
    int32_t const chan = (in_ix/784)%64;
    int32_t const y = (in_ix/28)%28;
    int32_t const x = in_ix%28;
    // recompose using the output strides and the output channel offset
    int32_t const out_ix = img*200704 + (chan+0)*784 + y*28 + x*1;
    out[out_ix] = in[in_ix];
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chans = 64 */
/* ysz = 28 */
/* xsz = 28 */
/* out_chans = 256 */
/* ocix = 0 */
/* rtc_func_name = copy__num_imgs_20__in_chans_64__ysz_28__xsz_28__out_chans_256__ocix_0 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 64 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%64) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 50176 */
/* in_ix_img_nomod = (in_ix/50176) */
/* in_ix_img = (in_ix/50176) */
/* in_ix_sz = 1003520 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 256 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%256) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 200704 */
/* out_ix_img_nomod = (out_ix/200704) */
/* out_ix_img = (out_ix/200704) */
/* out_ix_sz = 4014080 */
// each thread: copies one element of in (img:chan:y:x, 20x128x28x28) to the same
// img/y/x position of out (img:chan:y:x, 20x256x28x28), at channel (chan + 64).
// threads with an index at or beyond the input size (2007040) do nothing.
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_128__ysz_28__xsz_28__out_chans_256__ocix_64( GASQ float const * const in, GASQ float * const out ) {
  int32_t const in_ix = GLOB_ID_1D;
  if( in_ix < 2007040 ) {
    // decompose the linear input index
    int32_t const img = in_ix/100352;
    int32_t const chan = (in_ix/784)%128;
    int32_t const y = (in_ix/28)%28;
    int32_t const x = in_ix%28;
    // recompose using the output strides and the output channel offset
    int32_t const out_ix = img*200704 + (chan+64)*784 + y*28 + x*1;
    out[out_ix] = in[in_ix];
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chans = 128 */
/* ysz = 28 */
/* xsz = 28 */
/* out_chans = 256 */
/* ocix = 64 */
/* rtc_func_name = copy__num_imgs_20__in_chans_128__ysz_28__xsz_28__out_chans_256__ocix_64 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 128 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%128) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 100352 */
/* in_ix_img_nomod = (in_ix/100352) */
/* in_ix_img = (in_ix/100352) */
/* in_ix_sz = 2007040 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 256 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%256) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 200704 */
/* out_ix_img_nomod = (out_ix/200704) */
/* out_ix_img = (out_ix/200704) */
/* out_ix_sz = 4014080 */
// each thread: copies one element of in to its channel-offset position in out
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_192( GASQ float const * const in, GASQ float * const out ) {
  // Each work-item copies at most one element of in. Its linear index is split into
  // (img, chan, y, x) over dims (20, 32, 28, 28) and re-linearized for out, whose
  // chan dim is 256 (see the substitution table for this kernel).
  int32_t const src = GLOB_ID_1D;
  if( src >= 501760 ) { return; } // 501760 == 20*32*28*28: nothing to copy past the end of in
  int32_t const img = src / 25088;        // 25088 == 32*784 elements per input image
  int32_t const chan = (src / 784) % 32;  // 784 == 28*28 elements per channel plane
  int32_t const y = (src / 28) % 28;
  int32_t const x = src % 28;
  // input channel c is written to output channel c + 192 (ocix);
  // 200704 == 256*784 elements per output image
  int32_t const dst = img*200704 + (chan + 192)*784 + y*28 + x;
  out[dst] = in[src];
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chans = 32 */
/* ysz = 28 */
/* xsz = 28 */
/* out_chans = 256 */
/* ocix = 192 */
/* rtc_func_name = copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_192 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 32 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%32) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 25088 */
/* in_ix_img_nomod = (in_ix/25088) */
/* in_ix_img = (in_ix/25088) */
/* in_ix_sz = 501760 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 256 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%256) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 200704 */
/* out_ix_img_nomod = (out_ix/200704) */
/* out_ix_img = (out_ix/200704) */
/* out_ix_sz = 4014080 */
// each thread: copies one element of in to its channel-offset position in out
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_224( GASQ float const * const in, GASQ float * const out ) {
  // Each work-item copies at most one element of in. Its linear index is split into
  // (img, chan, y, x) over dims (20, 32, 28, 28) and re-linearized for out, whose
  // chan dim is 256 (see the substitution table for this kernel).
  int32_t const src = GLOB_ID_1D;
  if( src >= 501760 ) { return; } // 501760 == 20*32*28*28: nothing to copy past the end of in
  int32_t const img = src / 25088;        // 25088 == 32*784 elements per input image
  int32_t const chan = (src / 784) % 32;  // 784 == 28*28 elements per channel plane
  int32_t const y = (src / 28) % 28;
  int32_t const x = src % 28;
  // input channel c is written to output channel c + 224 (ocix);
  // 200704 == 256*784 elements per output image
  int32_t const dst = img*200704 + (chan + 224)*784 + y*28 + x;
  out[dst] = in[src];
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chans = 32 */
/* ysz = 28 */
/* xsz = 28 */
/* out_chans = 256 */
/* ocix = 224 */
/* rtc_func_name = copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_224 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 32 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%32) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 25088 */
/* in_ix_img_nomod = (in_ix/25088) */
/* in_ix_img = (in_ix/25088) */
/* in_ix_sz = 501760 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 256 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%256) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 200704 */
/* out_ix_img_nomod = (out_ix/200704) */
/* out_ix_img = (out_ix/200704) */
/* out_ix_sz = 4014080 */
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_128__write_xposed_0__in_chans_256( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
//int32_t const blk_in_ix_sz = 8*8;
LOCSHAR_MEM float all_smem[1536]; // note: max(filts+in,out) == max(1024+512,1024)
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 1024;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*32768; // index of first out chan
int32_t blk_in_ix_base = GRP_ID_1D*16384 + LOC_ID_1D;// index of first input pel to load for this thread
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16);
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/16);
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D;
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
// iteratate over filter elements
for( int32_t blk_iter = 0; blk_iter != 32; ++blk_iter ) {
BARRIER_SYNC;
// begin smem_loads
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)];
filts_smem[(LOC_ID_1D + 128 * 4)] = filts[filts_off+(128*4)];
filts_smem[(LOC_ID_1D + 128 * 5)] = filts[filts_off+(128*5)];
filts_smem[(LOC_ID_1D + 128 * 6)] = filts[filts_off+(128*6)];
filts_smem[(LOC_ID_1D + 128 * 7)] = filts[filts_off+(128*7)];
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ];
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ];
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ];
// end smem_loads;
BARRIER_SYNC;
filts_off += 128*8;
blk_in_ix_base += 512;
// begin inner_loop_body
filts_strip[0] = filts_smem_off[0*128+0*16];
filts_strip[1] = filts_smem_off[0*128+1*16];
filts_strip[2] = filts_smem_off[0*128+2*16];
filts_strip[3] = filts_smem_off[0*128+3*16];
filts_strip[4] = filts_smem_off[0*128+4*16];
filts_strip[5] = filts_smem_off[0*128+5*16];
filts_strip[6] = filts_smem_off[0*128+6*16];
filts_strip[7] = filts_smem_off[0*128+7*16];
in_strip[0] = in_smem_off[(0*8*8+0)];
in_strip[1] = in_smem_off[(0*8*8+1)];
in_strip[2] = in_smem_off[(0*8*8+2)];
in_strip[3] = in_smem_off[(0*8*8+3)];
in_strip[4] = in_smem_off[(0*8*8+4)];
in_strip[5] = in_smem_off[(0*8*8+5)];
in_strip[6] = in_smem_off[(0*8*8+6)];
in_strip[7] = in_smem_off[(0*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*128+0*16];
filts_strip[1] = filts_smem_off[1*128+1*16];
filts_strip[2] = filts_smem_off[1*128+2*16];
filts_strip[3] = filts_smem_off[1*128+3*16];
filts_strip[4] = filts_smem_off[1*128+4*16];
filts_strip[5] = filts_smem_off[1*128+5*16];
filts_strip[6] = filts_smem_off[1*128+6*16];
filts_strip[7] = filts_smem_off[1*128+7*16];
in_strip[0] = in_smem_off[(1*8*8+0)];
in_strip[1] = in_smem_off[(1*8*8+1)];
in_strip[2] = in_smem_off[(1*8*8+2)];
in_strip[3] = in_smem_off[(1*8*8+3)];
in_strip[4] = in_smem_off[(1*8*8+4)];
in_strip[5] = in_smem_off[(1*8*8+5)];
in_strip[6] = in_smem_off[(1*8*8+6)];
in_strip[7] = in_smem_off[(1*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*128+0*16];
filts_strip[1] = filts_smem_off[2*128+1*16];
filts_strip[2] = filts_smem_off[2*128+2*16];
filts_strip[3] = filts_smem_off[2*128+3*16];
filts_strip[4] = filts_smem_off[2*128+4*16];
filts_strip[5] = filts_smem_off[2*128+5*16];
filts_strip[6] = filts_smem_off[2*128+6*16];
filts_strip[7] = filts_smem_off[2*128+7*16];
in_strip[0] = in_smem_off[(2*8*8+0)];
in_strip[1] = in_smem_off[(2*8*8+1)];
in_strip[2] = in_smem_off[(2*8*8+2)];
in_strip[3] = in_smem_off[(2*8*8+3)];
in_strip[4] = in_smem_off[(2*8*8+4)];
in_strip[5] = in_smem_off[(2*8*8+5)];
in_strip[6] = in_smem_off[(2*8*8+6)];
in_strip[7] = in_smem_off[(2*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*128+0*16];
filts_strip[1] = filts_smem_off[3*128+1*16];
filts_strip[2] = filts_smem_off[3*128+2*16];
filts_strip[3] = filts_smem_off[3*128+3*16];
filts_strip[4] = filts_smem_off[3*128+4*16];
filts_strip[5] = filts_smem_off[3*128+5*16];
filts_strip[6] = filts_smem_off[3*128+6*16];
filts_strip[7] = filts_smem_off[3*128+7*16];
in_strip[0] = in_smem_off[(3*8*8+0)];
in_strip[1] = in_smem_off[(3*8*8+1)];
in_strip[2] = in_smem_off[(3*8*8+2)];
in_strip[3] = in_smem_off[(3*8*8+3)];
in_strip[4] = in_smem_off[(3*8*8+4)];
in_strip[5] = in_smem_off[(3*8*8+5)];
in_strip[6] = in_smem_off[(3*8*8+6)];
in_strip[7] = in_smem_off[(3*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*128+0*16];
filts_strip[1] = filts_smem_off[4*128+1*16];
filts_strip[2] = filts_smem_off[4*128+2*16];
filts_strip[3] = filts_smem_off[4*128+3*16];
filts_strip[4] = filts_smem_off[4*128+4*16];
filts_strip[5] = filts_smem_off[4*128+5*16];
filts_strip[6] = filts_smem_off[4*128+6*16];
filts_strip[7] = filts_smem_off[4*128+7*16];
in_strip[0] = in_smem_off[(4*8*8+0)];
in_strip[1] = in_smem_off[(4*8*8+1)];
in_strip[2] = in_smem_off[(4*8*8+2)];
in_strip[3] = in_smem_off[(4*8*8+3)];
in_strip[4] = in_smem_off[(4*8*8+4)];
in_strip[5] = in_smem_off[(4*8*8+5)];
in_strip[6] = in_smem_off[(4*8*8+6)];
in_strip[7] = in_smem_off[(4*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*128+0*16];
filts_strip[1] = filts_smem_off[5*128+1*16];
filts_strip[2] = filts_smem_off[5*128+2*16];
filts_strip[3] = filts_smem_off[5*128+3*16];
filts_strip[4] = filts_smem_off[5*128+4*16];
filts_strip[5] = filts_smem_off[5*128+5*16];
filts_strip[6] = filts_smem_off[5*128+6*16];
filts_strip[7] = filts_smem_off[5*128+7*16];
in_strip[0] = in_smem_off[(5*8*8+0)];
in_strip[1] = in_smem_off[(5*8*8+1)];
in_strip[2] = in_smem_off[(5*8*8+2)];
in_strip[3] = in_smem_off[(5*8*8+3)];
in_strip[4] = in_smem_off[(5*8*8+4)];
in_strip[5] = in_smem_off[(5*8*8+5)];
in_strip[6] = in_smem_off[(5*8*8+6)];
in_strip[7] = in_smem_off[(5*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*128+0*16];
filts_strip[1] = filts_smem_off[6*128+1*16];
filts_strip[2] = filts_smem_off[6*128+2*16];
filts_strip[3] = filts_smem_off[6*128+3*16];
filts_strip[4] = filts_smem_off[6*128+4*16];
filts_strip[5] = filts_smem_off[6*128+5*16];
filts_strip[6] = filts_smem_off[6*128+6*16];
filts_strip[7] = filts_smem_off[6*128+7*16];
in_strip[0] = in_smem_off[(6*8*8+0)];
in_strip[1] = in_smem_off[(6*8*8+1)];
in_strip[2] = in_smem_off[(6*8*8+2)];
in_strip[3] = in_smem_off[(6*8*8+3)];
in_strip[4] = in_smem_off[(6*8*8+4)];
in_strip[5] = in_smem_off[(6*8*8+5)];
in_strip[6] = in_smem_off[(6*8*8+6)];
in_strip[7] = in_smem_off[(6*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*128+0*16];
filts_strip[1] = filts_smem_off[7*128+1*16];
filts_strip[2] = filts_smem_off[7*128+2*16];
filts_strip[3] = filts_smem_off[7*128+3*16];
filts_strip[4] = filts_smem_off[7*128+4*16];
filts_strip[5] = filts_smem_off[7*128+5*16];
filts_strip[6] = filts_smem_off[7*128+6*16];
filts_strip[7] = filts_smem_off[7*128+7*16];
in_strip[0] = in_smem_off[(7*8*8+0)];
in_strip[1] = in_smem_off[(7*8*8+1)];
in_strip[2] = in_smem_off[(7*8*8+2)];
in_strip[3] = in_smem_off[(7*8*8+3)];
in_strip[4] = in_smem_off[(7*8*8+4)];
in_strip[5] = in_smem_off[(7*8*8+5)];
in_strip[6] = in_smem_off[(7*8*8+6)];
in_strip[7] = in_smem_off[(7*8*8+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
;
}
// load per-block biases into smem
if( flags == 2 ) { return; }
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
if( t_smem_bias_ix < 128 ) {
int32_t const ocix_base = (GRP_ID_1D%1)*128;
int32_t const load_reg = t_smem_bias_ix / 16;
int32_t const load_tile = t_smem_bias_ix % 16;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 128 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// load biases into filts_strip
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*16];
filts_strip[1] = filts_smem_off[1*16];
filts_strip[2] = filts_smem_off[2*16];
filts_strip[3] = filts_smem_off[3*16];
filts_strip[4] = filts_smem_off[4*16];
filts_strip[5] = filts_smem_off[5*16];
filts_strip[6] = filts_smem_off[6*16];
filts_strip[7] = filts_smem_off[7*16];
// end t_tile_bias_loads;
if( flags == 1 ) {
GASQ float * const out_off = out + LOC_ID_1D;
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]);
;
return;
}
// add bias to each elem of out_tile[] and store the results to out[]
// begin t_tile_stores
int32_t tpix[8];
int32_t tcix[8];
tpix[0] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+0)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+0)%784)*1 ; // cache out patch ixs
tpix[1] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+1)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+1)%784)*1 ; // cache out patch ixs
tpix[2] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+2)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+2)%784)*1 ; // cache out patch ixs
tpix[3] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+3)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+3)%784)*1 ; // cache out patch ixs
tpix[4] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+4)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+4)%784)*1 ; // cache out patch ixs
tpix[5] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+5)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+5)%784)*1 ; // cache out patch ixs
tpix[6] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+6)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+6)%784)*1 ; // cache out patch ixs
tpix[7] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+7)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+7)%784)*1 ; // cache out patch ixs
tcix[0] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+0)*784; // cache out chan ixs
tcix[1] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+1)*784; // cache out chan ixs
tcix[2] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+2)*784; // cache out chan ixs
tcix[3] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+3)*784; // cache out chan ixs
tcix[4] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+4)*784; // cache out chan ixs
tcix[5] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+5)*784; // cache out chan ixs
tcix[6] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+6)*784; // cache out chan ixs
tcix[7] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+7)*784; // cache out chan ixs
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+0)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+1)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+2)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+3)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+4)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+5)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+6)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+7)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (128*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (128*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (128*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (128*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (128*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (128*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (128*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (128*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 1 */
/* out_chans = 128 */
/* write_xposed = 0 */
/* in_chans = 256 */
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_128__write_xposed_0__in_chans_256 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 128 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%128) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 100352 */
/* out_ix_img_nomod = (out_ix/100352) */
/* out_ix_img = (out_ix/100352) */
/* out_ix_sz = 2007040 */
/* tpb = 128 */
/* in_chan_tile = 8 */
/* LOC_ID_1D_out_chan_tile_dim = 16 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */
/* LOC_ID_1D_pels_tile_dim = 8 */
/* LOC_ID_1D_pels_tile_sz = 16 */
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/16) */
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/16) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_pels_blk_dim = 245 */
/* GRP_ID_1D_pels_blk_sz = 1 */
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_pels_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 245 */
/* in_ix_blk_pel_dim = 64 */
/* in_ix_blk_pel_sz = 1 */
/* in_ix_blk_pel_nomod = in_ix */
/* in_ix_blk_pel = (in_ix%%64) */
/* in_ix_blk_iter_chan_dim = 8 */
/* in_ix_blk_iter_chan_sz = 64 */
/* in_ix_blk_iter_chan_nomod = (in_ix/64) */
/* in_ix_blk_iter_chan = ((in_ix/64)%%8) */
/* in_ix_blk_iter_dim = 32 */
/* in_ix_blk_iter_sz = 512 */
/* in_ix_blk_iter_nomod = (in_ix/512) */
/* in_ix_blk_iter = ((in_ix/512)%%32) */
/* in_ix_blk_dim = 245 */
/* in_ix_blk_sz = 16384 */
/* in_ix_blk_nomod = (in_ix/16384) */
/* in_ix_blk = (in_ix/16384) */
/* in_ix_sz = 4014080 */
/* blk_filt_ix_sz = 128 */
/* filts_smem_sz = 1024 */
/* in_smem_sz = 512 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1536 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_in_chan_dim = 256 */
/* filts_xp_ix_in_chan_sz = 128 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/128)%%256) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 32768 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/32768) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/32768) */
/* filts_xp_ix_sz = 32768 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* smem_loads = // begin smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)];
filts_smem[(LOC_ID_1D + %(tpb) * 4)] = filts[filts_off+(%(tpb)*4)];
filts_smem[(LOC_ID_1D + %(tpb) * 5)] = filts[filts_off+(%(tpb)*5)];
filts_smem[(LOC_ID_1D + %(tpb) * 6)] = filts[filts_off+(%(tpb)*6)];
filts_smem[(LOC_ID_1D + %(tpb) * 7)] = filts[filts_off+(%(tpb)*7)];
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ];
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ];
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];
// end smem_loads */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* t_smem_ld_pel_pel_dim = 64 */
/* t_smem_ld_pel_pel_sz = 1 */
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%64) */
/* t_smem_ld_pel_chan_dim = 8 */
/* t_smem_ld_pel_chan_sz = 64 */
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/64) */
/* t_smem_ld_pel_chan = (t_smem_ld_pel/64) */
/* t_smem_ld_pel_sz = 512 */
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */
/* out_pel_0_pel_dim = 784 */
/* out_pel_0_pel_sz = 1 */
/* out_pel_0_pel_nomod = %(out_pel_0) */
/* out_pel_0_pel = (%(out_pel_0)%%784) */
/* out_pel_0_img_dim = 20 */
/* out_pel_0_img_sz = 784 */
/* out_pel_0_img_nomod = (%(out_pel_0)/784) */
/* out_pel_0_img = (%(out_pel_0)/784) */
/* out_pel_0_sz = 15680 */
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */
/* out_pel_1_pel_dim = 784 */
/* out_pel_1_pel_sz = 1 */
/* out_pel_1_pel_nomod = %(out_pel_1) */
/* out_pel_1_pel = (%(out_pel_1)%%784) */
/* out_pel_1_img_dim = 20 */
/* out_pel_1_img_sz = 784 */
/* out_pel_1_img_nomod = (%(out_pel_1)/784) */
/* out_pel_1_img = (%(out_pel_1)/784) */
/* out_pel_1_sz = 15680 */
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */
/* out_pel_2_pel_dim = 784 */
/* out_pel_2_pel_sz = 1 */
/* out_pel_2_pel_nomod = %(out_pel_2) */
/* out_pel_2_pel = (%(out_pel_2)%%784) */
/* out_pel_2_img_dim = 20 */
/* out_pel_2_img_sz = 784 */
/* out_pel_2_img_nomod = (%(out_pel_2)/784) */
/* out_pel_2_img = (%(out_pel_2)/784) */
/* out_pel_2_sz = 15680 */
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */
/* out_pel_3_pel_dim = 784 */
/* out_pel_3_pel_sz = 1 */
/* out_pel_3_pel_nomod = %(out_pel_3) */
/* out_pel_3_pel = (%(out_pel_3)%%784) */
/* out_pel_3_img_dim = 20 */
/* out_pel_3_img_sz = 784 */
/* out_pel_3_img_nomod = (%(out_pel_3)/784) */
/* out_pel_3_img = (%(out_pel_3)/784) */
/* out_pel_3_sz = 15680 */
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */
/* out_pel_4_pel_dim = 784 */
/* out_pel_4_pel_sz = 1 */
/* out_pel_4_pel_nomod = %(out_pel_4) */
/* out_pel_4_pel = (%(out_pel_4)%%784) */
/* out_pel_4_img_dim = 20 */
/* out_pel_4_img_sz = 784 */
/* out_pel_4_img_nomod = (%(out_pel_4)/784) */
/* out_pel_4_img = (%(out_pel_4)/784) */
/* out_pel_4_sz = 15680 */
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */
/* out_pel_5_pel_dim = 784 */
/* out_pel_5_pel_sz = 1 */
/* out_pel_5_pel_nomod = %(out_pel_5) */
/* out_pel_5_pel = (%(out_pel_5)%%784) */
/* out_pel_5_img_dim = 20 */
/* out_pel_5_img_sz = 784 */
/* out_pel_5_img_nomod = (%(out_pel_5)/784) */
/* out_pel_5_img = (%(out_pel_5)/784) */
/* out_pel_5_sz = 15680 */
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */
/* out_pel_6_pel_dim = 784 */
/* out_pel_6_pel_sz = 1 */
/* out_pel_6_pel_nomod = %(out_pel_6) */
/* out_pel_6_pel = (%(out_pel_6)%%784) */
/* out_pel_6_img_dim = 20 */
/* out_pel_6_img_sz = 784 */
/* out_pel_6_img_nomod = (%(out_pel_6)/784) */
/* out_pel_6_img = (%(out_pel_6)/784) */
/* out_pel_6_sz = 15680 */
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */
/* out_pel_7_pel_dim = 784 */
/* out_pel_7_pel_sz = 1 */
/* out_pel_7_pel_nomod = %(out_pel_7) */
/* out_pel_7_pel = (%(out_pel_7)%%784) */
/* out_pel_7_img_dim = 20 */
/* out_pel_7_img_sz = 784 */
/* out_pel_7_img_nomod = (%(out_pel_7)/784) */
/* out_pel_7_img = (%(out_pel_7)/784) */
/* out_pel_7_sz = 15680 */
/* t_tile_stores = // begin t_tile_stores
int32_t tpix[%(t_tile_sz)];
int32_t tcix[%(t_tile_sz)];
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them.
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
// Commented-out text labelled `t_tile_dummy_stores`; nothing in this block is compiled.
// It lists 64 stores, one per out_tile element i = 0..63:
//   out_off[128*i] = max(0.0f, out_tile[i] + filts_strip[i % 8]);
// i.e. each accumulator has one of the eight filts_strip values added, is clamped
// below at 0.0f, and is written at a fixed spacing of 128 elements.
// NOTE(review): the `name = value` layout suggests this is a recorded value of a
// code-generation template variable kept for reference -- confirm against the generator.
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]);
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]);
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]);
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]);
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]);
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]);
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]);
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]);
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]);
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]);
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]);
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]);
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]);
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]);
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]);
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]);
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]);
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]);
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]);
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]);
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]);
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]);
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]);
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]);
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]);
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]);
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]);
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]);
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]);
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]);
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]);
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]);
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]);
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]);
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]);
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]);
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]);
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]);
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]);
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]);
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]);
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]);
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]);
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]);
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]);
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]);
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]);
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]);
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]);
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]);
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]);
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]);
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]);
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]);
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]);
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]);
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]);
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]);
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]);
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]);
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]);
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]);
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]);
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]);
*/
// Commented-out text labelled `t_tile_bias_loads`; nothing in this block is compiled.
// It lists eight loads, k = 0..7:
//   filts_strip[k] = filts_smem_off[k * %(LOC_ID_1D_out_chan_tile_dim)];
// The %(LOC_ID_1D_out_chan_tile_dim) token appears literally here, i.e. the stride
// between consecutive loads is still a placeholder rather than a number.
// NOTE(review): going by the label, these presumably load per-channel biases into
// filts_strip ahead of the `out_tile[i] + filts_strip[k]` stores -- confirm against
// the generator template.
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* inner_loop_body = // begin inner_loop_body
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)];
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)];
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)];
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)];
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)];
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)];
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)];
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
*/
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_256__in_chans_256__ysz_28__xsz_28__tix_pels_tile_sz_8__bix_pels_blk_sz_245( GASQ float const * const in, GASQ float * const out ) {
int32_t const out_ix = GLOB_ID_1D;
int32_t const chan_ix = ((out_ix/512)%32)*8 + ((out_ix/64)%8);
int32_t const pel_ix = (out_ix/16384)*64 + (out_ix%64);
float v = 0.0f;
if( ( chan_ix < 256 ) && ( (pel_ix/784) < 20 ) ) {
v = in[ (pel_ix/784)*200704 +
chan_ix*784 +
((pel_ix/28)%28)*28 +
(pel_ix%28)*1 ];
}
out[out_ix] = v;
}
/*
in_pels = num_img * in.sz.dims_prod()
num_in_blks = u32_ceil_div( in_pels, block_chan_pels )
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12
pad_chan = block_iter * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512=2K (per-iter cache) is fine.
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?)
*/
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_chan_tile = 8 */
/* pad_in_chans = 256 */
/* in_chans = 256 */
/* ysz = 28 */
/* xsz = 28 */
/* tix_pels_tile_sz = 8 */
/* bix_pels_blk_sz = 245 */
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_256__in_chans_256__ysz_28__xsz_28__tix_pels_tile_sz_8__bix_pels_blk_sz_245 */
/* out_ix_blk_pel_dim = 64 */
/* out_ix_blk_pel_sz = 1 */
/* out_ix_blk_pel_nomod = out_ix */
/* out_ix_blk_pel = (out_ix%%64) */
/* out_ix_blk_iter_chan_dim = 8 */
/* out_ix_blk_iter_chan_sz = 64 */
/* out_ix_blk_iter_chan_nomod = (out_ix/64) */
/* out_ix_blk_iter_chan = ((out_ix/64)%%8) */
/* out_ix_blk_iter_dim = 32 */
/* out_ix_blk_iter_sz = 512 */
/* out_ix_blk_iter_nomod = (out_ix/512) */
/* out_ix_blk_iter = ((out_ix/512)%%32) */
/* out_ix_blk_dim = 245 */
/* out_ix_blk_sz = 16384 */
/* out_ix_blk_nomod = (out_ix/16384) */
/* out_ix_blk = (out_ix/16384) */
/* out_ix_sz = 4014080 */
/* pel_ix_x_dim = 28 */
/* pel_ix_x_sz = 1 */
/* pel_ix_x_nomod = pel_ix */
/* pel_ix_x = (pel_ix%%28) */
/* pel_ix_y_dim = 28 */
/* pel_ix_y_sz = 28 */
/* pel_ix_y_nomod = (pel_ix/28) */
/* pel_ix_y = ((pel_ix/28)%%28) */
/* pel_ix_img_dim = 20 */
/* pel_ix_img_sz = 784 */
/* pel_ix_img_nomod = (pel_ix/784) */
/* pel_ix_img = (pel_ix/784) */
/* pel_ix_sz = 15680 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 256 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%256) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 200704 */
/* in_ix_img_nomod = (in_ix/200704) */
/* in_ix_img = (in_ix/200704) */
/* in_ix_sz = 4014080 */
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_128__in_chans_256__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
// CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
int32_t const filts_ix = GLOB_ID_1D;
if( filts_ix >= 32768 ) { return; }
int32_t const fioc = (filts_ix/256);
float val = 0.0f;
int32_t const filts_xp_ix =
(fioc/128)*32768 +
(fioc%8)*16 +
((fioc/8)%16)*1 +
(filts_ix%256)*128 +
(filts_ix%1)*128 +
(filts_ix%1)*128;
#if 1
val = in[filts_ix];
#else
if( (filts_ix%256) == 0 ) {
// if( ((filts_ix%1) == 5) && ((filts_ix%1) == 5) )
{
val = (filts_ix%1)*100 + (filts_ix%1);
}
}
#endif
out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 128 */
/* in_chans = 256 */
/* kysz = 1 */
/* kxsz = 1 */
/* rtc_func_name = xpose_filts__out_chans_128__in_chans_256__kysz_1__kxsz_1 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 1 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%1) */
/* filts_ix_y_dim = 1 */
/* filts_ix_y_sz = 1 */
/* filts_ix_y_nomod = filts_ix */
/* filts_ix_y = (filts_ix%%1) */
/* filts_ix_in_chan_dim = 256 */
/* filts_ix_in_chan_sz = 1 */
/* filts_ix_in_chan_nomod = filts_ix */
/* filts_ix_in_chan = (filts_ix%%256) */
/* filts_ix_out_chan_dim = 128 */
/* filts_ix_out_chan_sz = 256 */
/* filts_ix_out_chan_nomod = (filts_ix/256) */
/* filts_ix_out_chan = (filts_ix/256) */
/* filts_ix_sz = 32768 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 128 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_y = ((filts_xp_ix/128)%%1) */
/* filts_xp_ix_in_chan_dim = 256 */
/* filts_xp_ix_in_chan_sz = 128 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/128)%%256) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 32768 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/32768) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/32768) */
/* filts_xp_ix_sz = 32768 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 16 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%16) */
/* fioc_out_chan_blk_dim = 1 */
/* fioc_out_chan_blk_sz = 128 */
/* fioc_out_chan_blk_nomod = (fioc/128) */
/* fioc_out_chan_blk = (fioc/128) */
/* fioc_sz = 128 */
// each thread: computes 8x8 block of out
// loop over k dim
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_128( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(384+120,1024)
LSMASQ float * const filts_smem = all_smem;
LSMASQ float * const in_smem = filts_smem + 384;
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
float in_strip[10]; // segment of input line sufficient for one unrolling of inner loop
int32_t blk_in_ix_base = (GRP_ID_1D/2)*15360 + LOC_ID_1D;// index of first input pel to load for this thread
int32_t const blk_filt_ix_base = (GRP_ID_1D%2)*147456; // index of first out chan
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D;
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16);
int32_t out_line = (GRP_ID_1D/8)*8; // first out_line of block
int32_t const blk_fli = (out_line/28); // image of first out_line of block
out_line += (LOC_ID_1D/16); // adjust to out_line of this thread
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img)
int32_t const img_off_lines = ((out_line/28) - blk_fli)*(3-1);
int32_t const in_y = (out_line%28)*1 - 1;
for( int32_t in_chan = 0; in_chan != 128; ++in_chan ) {
BARRIER_SYNC;
// begin in_smem_loads
if( (LOC_ID_1D + 128 * 0) < 120) { in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];}
blk_in_ix_base += 120;
// end in_smem_loads;
for( int32_t ky = 0; ky != 3; ++ky ) {
if( ky != 0 ) { BARRIER_SYNC; }
// begin filt_smem_loads
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
filts_off += 384;
// end filt_smem_loads;
BARRIER_SYNC;
if( (out_line/28) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid)
if( ((in_y+ky) < 0) || ((in_y+ky)>28) ) { continue; } // optimization: skip known-to-be-padding input lines
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/16)*1+ky+img_off_lines)*10;
// begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
filts_strip[0] = filts_smem_off[0*128+0*16];
filts_strip[1] = filts_smem_off[0*128+1*16];
filts_strip[2] = filts_smem_off[0*128+2*16];
filts_strip[3] = filts_smem_off[0*128+3*16];
filts_strip[4] = filts_smem_off[0*128+4*16];
filts_strip[5] = filts_smem_off[0*128+5*16];
filts_strip[6] = filts_smem_off[0*128+6*16];
filts_strip[7] = filts_smem_off[0*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*128+0*16];
filts_strip[1] = filts_smem_off[1*128+1*16];
filts_strip[2] = filts_smem_off[1*128+2*16];
filts_strip[3] = filts_smem_off[1*128+3*16];
filts_strip[4] = filts_smem_off[1*128+4*16];
filts_strip[5] = filts_smem_off[1*128+5*16];
filts_strip[6] = filts_smem_off[1*128+6*16];
filts_strip[7] = filts_smem_off[1*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
filts_strip[0] = filts_smem_off[2*128+0*16];
filts_strip[1] = filts_smem_off[2*128+1*16];
filts_strip[2] = filts_smem_off[2*128+2*16];
filts_strip[3] = filts_smem_off[2*128+3*16];
filts_strip[4] = filts_smem_off[2*128+4*16];
filts_strip[5] = filts_smem_off[2*128+5*16];
filts_strip[6] = filts_smem_off[2*128+6*16];
filts_strip[7] = filts_smem_off[2*128+7*16];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
;
}
}
if( flags == 2 ) { return; }
BARRIER_SYNC;
for( int32_t i = 0; i != 1; ++i ) {
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
if( t_smem_bias_ix < 128 ) {
int32_t const ocix_base = (GRP_ID_1D%2)*128;
int32_t const load_reg = t_smem_bias_ix / 16;
int32_t const load_tile = t_smem_bias_ix % 16;
int32_t const ocix = ocix_base + load_tile*8 + load_reg;
if( ocix < 192 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
}
}
BARRIER_SYNC;
// begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*16];
filts_strip[1] = filts_smem_off[1*16];
filts_strip[2] = filts_smem_off[2*16];
filts_strip[3] = filts_smem_off[3*16];
filts_strip[4] = filts_smem_off[4*16];
filts_strip[5] = filts_smem_off[5*16];
filts_strip[6] = filts_smem_off[6*16];
filts_strip[7] = filts_smem_off[7*16];
// end t_tile_bias_loads;
if( flags == 1 ) { return; }
// begin t_tile_stores
if( (out_line/28) >= 20 ) { return; }
int32_t out_x = ((GRP_ID_1D/2)%4)*8;
int32_t out_chan = ((GRP_ID_1D%2)*16 + (LOC_ID_1D%16))*8;
GASQ float * out_off = out + (out_line/28)*150528 + out_chan*784 + (out_line%28)*28 + out_x*1 ;
if( (out_x + 0) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* kern_sz = 3 */
/* stride = 1 */
/* in_pad = 1 */
/* t_tile_sz = 8 */
/* conv_has_relu = 1 */
/* out_chans = 192 */
/* in_chans = 128 */
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_128 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 192 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%192) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 150528 */
/* out_ix_img_nomod = (out_ix/150528) */
/* out_ix_img = (out_ix/150528) */
/* out_ix_sz = 3010560 */
/* tpb = 128 */
/* out_line_y_dim = 28 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%28) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 28 */
/* out_line_img_nomod = (out_line/28) */
/* out_line_img = (out_line/28) */
/* out_line_sz = 560 */
/* in_ix_blk_x_dim = 10 */
/* in_ix_blk_x_sz = 1 */
/* in_ix_blk_x_nomod = in_ix */
/* in_ix_blk_x = (in_ix%%10) */
/* in_ix_blk_y_dim = 12 */
/* in_ix_blk_y_sz = 10 */
/* in_ix_blk_y_nomod = (in_ix/10) */
/* in_ix_blk_y = ((in_ix/10)%%12) */
/* in_ix_blk_in_chan_dim = 128 */
/* in_ix_blk_in_chan_sz = 120 */
/* in_ix_blk_in_chan_nomod = (in_ix/120) */
/* in_ix_blk_in_chan = ((in_ix/120)%%128) */
/* in_ix_blk_bx_dim = 4 */
/* in_ix_blk_bx_sz = 15360 */
/* in_ix_blk_bx_nomod = (in_ix/15360) */
/* in_ix_blk_bx = ((in_ix/15360)%%4) */
/* in_ix_blk_bline_dim = 70 */
/* in_ix_blk_bline_sz = 61440 */
/* in_ix_blk_bline_nomod = (in_ix/61440) */
/* in_ix_blk_bline = (in_ix/61440) */
/* in_ix_sz = 4300800 */
/* LOC_ID_1D_out_chan_tile_dim = 16 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */
/* LOC_ID_1D_blk_y_dim = 8 */
/* LOC_ID_1D_blk_y_sz = 16 */
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/16) */
/* LOC_ID_1D_blk_y = (LOC_ID_1D/16) */
/* LOC_ID_1D_sz = 128 */
/* GRP_ID_1D_out_chan_blk_dim = 2 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%2) */
/* GRP_ID_1D_blk_bx_dim = 4 */
/* GRP_ID_1D_blk_bx_sz = 2 */
/* GRP_ID_1D_blk_bx_nomod = (GRP_ID_1D/2) */
/* GRP_ID_1D_blk_bx = ((GRP_ID_1D/2)%%4) */
/* GRP_ID_1D_blk_bline_dim = 70 */
/* GRP_ID_1D_blk_bline_sz = 8 */
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/8) */
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/8) */
/* GRP_ID_1D_sz = 560 */
/* blk_filt_ix_sz = 128 */
/* filts_smem_sz = 384 */
/* in_smem_sz = 120 */
/* out_smem_sz = 1024 */
/* all_smem_sz = 1024 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 3 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */
/* filts_xp_ix_y_dim = 3 */
/* filts_xp_ix_y_sz = 384 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */
/* filts_xp_ix_in_chan_dim = 128 */
/* filts_xp_ix_in_chan_sz = 1152 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%128) */
/* filts_xp_ix_out_chan_blk_dim = 2 */
/* filts_xp_ix_out_chan_blk_sz = 147456 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/147456) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/147456) */
/* filts_xp_ix_sz = 294912 */
/* out_chan_bias_smem_load_iter = 1 */
/* filts_off_adj = LOC_ID_1D */
/* filt_smem_loads = // begin filt_smem_loads
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)];
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)];
filts_off += %(filts_xp_ix_y_sz);
// end filt_smem_loads */
/* in_smem_loads = // begin in_smem_loads
if( (LOC_ID_1D + %(tpb) * 0) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];}
blk_in_ix_base += %(in_ix_blk_in_chan_sz);
// end in_smem_loads */
/* inner_loop_body = // begin inner_loop_body
in_strip[0] = in_smem_off[0];
in_strip[1] = in_smem_off[1];
in_strip[2] = in_smem_off[2];
in_strip[3] = in_smem_off[3];
in_strip[4] = in_smem_off[4];
in_strip[5] = in_smem_off[5];
in_strip[6] = in_smem_off[6];
in_strip[7] = in_smem_off[7];
in_strip[8] = in_smem_off[8];
in_strip[9] = in_smem_off[9];
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25] += filts_strip[1]*in_strip[3];
out_tile[26] += filts_strip[2]*in_strip[3];
out_tile[27] += filts_strip[3]*in_strip[3];
out_tile[28] += filts_strip[4]*in_strip[3];
out_tile[29] += filts_strip[5]*in_strip[3];
out_tile[30] += filts_strip[6]*in_strip[3];
out_tile[31] += filts_strip[7]*in_strip[3];
out_tile[32] += filts_strip[0]*in_strip[4];
out_tile[33] += filts_strip[1]*in_strip[4];
out_tile[34] += filts_strip[2]*in_strip[4];
out_tile[35] += filts_strip[3]*in_strip[4];
out_tile[36] += filts_strip[4]*in_strip[4];
out_tile[37] += filts_strip[5]*in_strip[4];
out_tile[38] += filts_strip[6]*in_strip[4];
out_tile[39] += filts_strip[7]*in_strip[4];
out_tile[40] += filts_strip[0]*in_strip[5];
out_tile[41] += filts_strip[1]*in_strip[5];
out_tile[42] += filts_strip[2]*in_strip[5];
out_tile[43] += filts_strip[3]*in_strip[5];
out_tile[44] += filts_strip[4]*in_strip[5];
out_tile[45] += filts_strip[5]*in_strip[5];
out_tile[46] += filts_strip[6]*in_strip[5];
out_tile[47] += filts_strip[7]*in_strip[5];
out_tile[48] += filts_strip[0]*in_strip[6];
out_tile[49] += filts_strip[1]*in_strip[6];
out_tile[50] += filts_strip[2]*in_strip[6];
out_tile[51] += filts_strip[3]*in_strip[6];
out_tile[52] += filts_strip[4]*in_strip[6];
out_tile[53] += filts_strip[5]*in_strip[6];
out_tile[54] += filts_strip[6]*in_strip[6];
out_tile[55] += filts_strip[7]*in_strip[6];
out_tile[56] += filts_strip[0]*in_strip[7];
out_tile[57] += filts_strip[1]*in_strip[7];
out_tile[58] += filts_strip[2]*in_strip[7];
out_tile[59] += filts_strip[3]*in_strip[7];
out_tile[60] += filts_strip[4]*in_strip[7];
out_tile[61] += filts_strip[5]*in_strip[7];
out_tile[62] += filts_strip[6]*in_strip[7];
out_tile[63] += filts_strip[7]*in_strip[7];
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[1];
out_tile[1] += filts_strip[1]*in_strip[1];
out_tile[2] += filts_strip[2]*in_strip[1];
out_tile[3] += filts_strip[3]*in_strip[1];
out_tile[4] += filts_strip[4]*in_strip[1];
out_tile[5] += filts_strip[5]*in_strip[1];
out_tile[6] += filts_strip[6]*in_strip[1];
out_tile[7] += filts_strip[7]*in_strip[1];
out_tile[8] += filts_strip[0]*in_strip[2];
out_tile[9] += filts_strip[1]*in_strip[2];
out_tile[10] += filts_strip[2]*in_strip[2];
out_tile[11] += filts_strip[3]*in_strip[2];
out_tile[12] += filts_strip[4]*in_strip[2];
out_tile[13] += filts_strip[5]*in_strip[2];
out_tile[14] += filts_strip[6]*in_strip[2];
out_tile[15] += filts_strip[7]*in_strip[2];
out_tile[16] += filts_strip[0]*in_strip[3];
out_tile[17] += filts_strip[1]*in_strip[3];
out_tile[18] += filts_strip[2]*in_strip[3];
out_tile[19] += filts_strip[3]*in_strip[3];
out_tile[20] += filts_strip[4]*in_strip[3];
out_tile[21] += filts_strip[5]*in_strip[3];
out_tile[22] += filts_strip[6]*in_strip[3];
out_tile[23] += filts_strip[7]*in_strip[3];
out_tile[24] += filts_strip[0]*in_strip[4];
out_tile[25] += filts_strip[1]*in_strip[4];
out_tile[26] += filts_strip[2]*in_strip[4];
out_tile[27] += filts_strip[3]*in_strip[4];
out_tile[28] += filts_strip[4]*in_strip[4];
out_tile[29] += filts_strip[5]*in_strip[4];
out_tile[30] += filts_strip[6]*in_strip[4];
out_tile[31] += filts_strip[7]*in_strip[4];
out_tile[32] += filts_strip[0]*in_strip[5];
out_tile[33] += filts_strip[1]*in_strip[5];
out_tile[34] += filts_strip[2]*in_strip[5];
out_tile[35] += filts_strip[3]*in_strip[5];
out_tile[36] += filts_strip[4]*in_strip[5];
out_tile[37] += filts_strip[5]*in_strip[5];
out_tile[38] += filts_strip[6]*in_strip[5];
out_tile[39] += filts_strip[7]*in_strip[5];
out_tile[40] += filts_strip[0]*in_strip[6];
out_tile[41] += filts_strip[1]*in_strip[6];
out_tile[42] += filts_strip[2]*in_strip[6];
out_tile[43] += filts_strip[3]*in_strip[6];
out_tile[44] += filts_strip[4]*in_strip[6];
out_tile[45] += filts_strip[5]*in_strip[6];
out_tile[46] += filts_strip[6]*in_strip[6];
out_tile[47] += filts_strip[7]*in_strip[6];
out_tile[48] += filts_strip[0]*in_strip[7];
out_tile[49] += filts_strip[1]*in_strip[7];
out_tile[50] += filts_strip[2]*in_strip[7];
out_tile[51] += filts_strip[3]*in_strip[7];
out_tile[52] += filts_strip[4]*in_strip[7];
out_tile[53] += filts_strip[5]*in_strip[7];
out_tile[54] += filts_strip[6]*in_strip[7];
out_tile[55] += filts_strip[7]*in_strip[7];
out_tile[56] += filts_strip[0]*in_strip[8];
out_tile[57] += filts_strip[1]*in_strip[8];
out_tile[58] += filts_strip[2]*in_strip[8];
out_tile[59] += filts_strip[3]*in_strip[8];
out_tile[60] += filts_strip[4]*in_strip[8];
out_tile[61] += filts_strip[5]*in_strip[8];
out_tile[62] += filts_strip[6]*in_strip[8];
out_tile[63] += filts_strip[7]*in_strip[8];
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)];
out_tile[0] += filts_strip[0]*in_strip[2];
out_tile[1] += filts_strip[1]*in_strip[2];
out_tile[2] += filts_strip[2]*in_strip[2];
out_tile[3] += filts_strip[3]*in_strip[2];
out_tile[4] += filts_strip[4]*in_strip[2];
out_tile[5] += filts_strip[5]*in_strip[2];
out_tile[6] += filts_strip[6]*in_strip[2];
out_tile[7] += filts_strip[7]*in_strip[2];
out_tile[8] += filts_strip[0]*in_strip[3];
out_tile[9] += filts_strip[1]*in_strip[3];
out_tile[10] += filts_strip[2]*in_strip[3];
out_tile[11] += filts_strip[3]*in_strip[3];
out_tile[12] += filts_strip[4]*in_strip[3];
out_tile[13] += filts_strip[5]*in_strip[3];
out_tile[14] += filts_strip[6]*in_strip[3];
out_tile[15] += filts_strip[7]*in_strip[3];
out_tile[16] += filts_strip[0]*in_strip[4];
out_tile[17] += filts_strip[1]*in_strip[4];
out_tile[18] += filts_strip[2]*in_strip[4];
out_tile[19] += filts_strip[3]*in_strip[4];
out_tile[20] += filts_strip[4]*in_strip[4];
out_tile[21] += filts_strip[5]*in_strip[4];
out_tile[22] += filts_strip[6]*in_strip[4];
out_tile[23] += filts_strip[7]*in_strip[4];
out_tile[24] += filts_strip[0]*in_strip[5];
out_tile[25] += filts_strip[1]*in_strip[5];
out_tile[26] += filts_strip[2]*in_strip[5];
out_tile[27] += filts_strip[3]*in_strip[5];
out_tile[28] += filts_strip[4]*in_strip[5];
out_tile[29] += filts_strip[5]*in_strip[5];
out_tile[30] += filts_strip[6]*in_strip[5];
out_tile[31] += filts_strip[7]*in_strip[5];
out_tile[32] += filts_strip[0]*in_strip[6];
out_tile[33] += filts_strip[1]*in_strip[6];
out_tile[34] += filts_strip[2]*in_strip[6];
out_tile[35] += filts_strip[3]*in_strip[6];
out_tile[36] += filts_strip[4]*in_strip[6];
out_tile[37] += filts_strip[5]*in_strip[6];
out_tile[38] += filts_strip[6]*in_strip[6];
out_tile[39] += filts_strip[7]*in_strip[6];
out_tile[40] += filts_strip[0]*in_strip[7];
out_tile[41] += filts_strip[1]*in_strip[7];
out_tile[42] += filts_strip[2]*in_strip[7];
out_tile[43] += filts_strip[3]*in_strip[7];
out_tile[44] += filts_strip[4]*in_strip[7];
out_tile[45] += filts_strip[5]*in_strip[7];
out_tile[46] += filts_strip[6]*in_strip[7];
out_tile[47] += filts_strip[7]*in_strip[7];
out_tile[48] += filts_strip[0]*in_strip[8];
out_tile[49] += filts_strip[1]*in_strip[8];
out_tile[50] += filts_strip[2]*in_strip[8];
out_tile[51] += filts_strip[3]*in_strip[8];
out_tile[52] += filts_strip[4]*in_strip[8];
out_tile[53] += filts_strip[5]*in_strip[8];
out_tile[54] += filts_strip[6]*in_strip[8];
out_tile[55] += filts_strip[7]*in_strip[8];
out_tile[56] += filts_strip[0]*in_strip[9];
out_tile[57] += filts_strip[1]*in_strip[9];
out_tile[58] += filts_strip[2]*in_strip[9];
out_tile[59] += filts_strip[3]*in_strip[9];
out_tile[60] += filts_strip[4]*in_strip[9];
out_tile[61] += filts_strip[5]*in_strip[9];
out_tile[62] += filts_strip[6]*in_strip[9];
out_tile[63] += filts_strip[7]*in_strip[9];
*/
/* t_tile_bias_loads = // begin t_tile_bias_loads
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)];
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)];
// end t_tile_bias_loads */
/* t_tile_stores = // begin t_tile_stores
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; }
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz);
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz);
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ;
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); }
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); }
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); }
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); }
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); }
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); }
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); }
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them.
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); }
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); }
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); }
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); }
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); }
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); }
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); }
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); }
// end t_tile_stores */
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_128__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280( GASQ float const * const in, GASQ float * const out ) {
int32_t const out_ix = GLOB_ID_1D;
if( out_ix >= 4300800 ) { return; }
int32_t const out_line = (out_ix/61440)*8;
int32_t const fi_skip_in_lines = (out_line%28)*1;
int32_t const in_line = (((out_ix/10)%12)+fi_skip_in_lines);
int32_t const img_in_lines = (28 - 1)*1 + 3;
int32_t const img_off = in_line/img_in_lines;
int32_t const img = (out_line/28) + img_off;
int32_t const iy = (in_line % img_in_lines) - 1; //(out_line%28)*1 + ((out_ix/10)%12) - 1;
int32_t const ix = ((out_ix/15360)%4)*8*1 + (out_ix%10) - 1;
float v = 0.0f;
if( 1
&& ( ix >= 0 )
&& ( iy >= 0 )
&& ( ix < 28 )
&& ( iy < 28 )
&& ( img < 20 )
)
{
v = in[ img*100352 +
((out_ix/120)%128)*784 +
iy*28 +
ix*1 ];
}
out[out_ix] = v;
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* stride = 1 */
/* kern_sz = 3 */
/* in_pad = 1 */
/* in_chans = 128 */
/* ysz = 28 */
/* xsz = 28 */
/* tix_pels_tile_sz = 8 */
/* t_tile_sz = 8 */
/* bix_pels_blk_sz = 280 */
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_128__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280 */
/* out_ix_blk_x_dim = 10 */
/* out_ix_blk_x_sz = 1 */
/* out_ix_blk_x_nomod = out_ix */
/* out_ix_blk_x = (out_ix%%10) */
/* out_ix_blk_y_dim = 12 */
/* out_ix_blk_y_sz = 10 */
/* out_ix_blk_y_nomod = (out_ix/10) */
/* out_ix_blk_y = ((out_ix/10)%%12) */
/* out_ix_blk_in_chan_dim = 128 */
/* out_ix_blk_in_chan_sz = 120 */
/* out_ix_blk_in_chan_nomod = (out_ix/120) */
/* out_ix_blk_in_chan = ((out_ix/120)%%128) */
/* out_ix_blk_bx_dim = 4 */
/* out_ix_blk_bx_sz = 15360 */
/* out_ix_blk_bx_nomod = (out_ix/15360) */
/* out_ix_blk_bx = ((out_ix/15360)%%4) */
/* out_ix_blk_bline_dim = 70 */
/* out_ix_blk_bline_sz = 61440 */
/* out_ix_blk_bline_nomod = (out_ix/61440) */
/* out_ix_blk_bline = (out_ix/61440) */
/* out_ix_sz = 4300800 */
/* out_line_y_dim = 28 */
/* out_line_y_sz = 1 */
/* out_line_y_nomod = out_line */
/* out_line_y = (out_line%%28) */
/* out_line_img_dim = 20 */
/* out_line_img_sz = 28 */
/* out_line_img_nomod = (out_line/28) */
/* out_line_img = (out_line/28) */
/* out_line_sz = 560 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 128 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%128) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 100352 */
/* in_ix_img_nomod = (in_ix/100352) */
/* in_ix_img = (in_ix/100352) */
/* in_ix_sz = 2007040 */
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_192__in_chans_128__kysz_3__kxsz_3( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
// CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
int32_t const filts_ix = GLOB_ID_1D;
if( filts_ix >= 221184 ) { return; }
int32_t const fioc = (filts_ix/1152);
float val = 0.0f;
int32_t const filts_xp_ix =
(fioc/128)*147456 +
(fioc%8)*16 +
((fioc/8)%16)*1 +
((filts_ix/9)%128)*1152 +
((filts_ix/3)%3)*384 +
(filts_ix%3)*128;
#if 1
val = in[filts_ix];
#else
if( ((filts_ix/9)%128) == 0 ) {
// if( ((filts_ix%3) == 5) && (((filts_ix/3)%3) == 5) )
{
val = (filts_ix%3)*100 + ((filts_ix/3)%3);
}
}
#endif
out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 192 */
/* in_chans = 128 */
/* kysz = 3 */
/* kxsz = 3 */
/* rtc_func_name = xpose_filts__out_chans_192__in_chans_128__kysz_3__kxsz_3 */
/* t_tile_sz = 8 */
/* filts_ix_x_dim = 3 */
/* filts_ix_x_sz = 1 */
/* filts_ix_x_nomod = filts_ix */
/* filts_ix_x = (filts_ix%%3) */
/* filts_ix_y_dim = 3 */
/* filts_ix_y_sz = 3 */
/* filts_ix_y_nomod = (filts_ix/3) */
/* filts_ix_y = ((filts_ix/3)%%3) */
/* filts_ix_in_chan_dim = 128 */
/* filts_ix_in_chan_sz = 9 */
/* filts_ix_in_chan_nomod = (filts_ix/9) */
/* filts_ix_in_chan = ((filts_ix/9)%%128) */
/* filts_ix_out_chan_dim = 192 */
/* filts_ix_out_chan_sz = 1152 */
/* filts_ix_out_chan_nomod = (filts_ix/1152) */
/* filts_ix_out_chan = (filts_ix/1152) */
/* filts_ix_sz = 221184 */
/* filts_xp_ix_out_chan_tile_dim = 16 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 16 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */
/* filts_xp_ix_x_dim = 3 */
/* filts_xp_ix_x_sz = 128 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */
/* filts_xp_ix_y_dim = 3 */
/* filts_xp_ix_y_sz = 384 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */
/* filts_xp_ix_in_chan_dim = 128 */
/* filts_xp_ix_in_chan_sz = 1152 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%128) */
/* filts_xp_ix_out_chan_blk_dim = 2 */
/* filts_xp_ix_out_chan_blk_sz = 147456 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/147456) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/147456) */
/* filts_xp_ix_sz = 294912 */
/* fioc_out_chan_reg_dim = 8 */
/* fioc_out_chan_reg_sz = 1 */
/* fioc_out_chan_reg_nomod = fioc */
/* fioc_out_chan_reg = (fioc%%8) */
/* fioc_out_chan_tile_dim = 16 */
/* fioc_out_chan_tile_sz = 8 */
/* fioc_out_chan_tile_nomod = (fioc/8) */
/* fioc_out_chan_tile = ((fioc/8)%%16) */
/* fioc_out_chan_blk_dim = 2 */
/* fioc_out_chan_blk_sz = 128 */
/* fioc_out_chan_blk_nomod = (fioc/128) */
/* fioc_out_chan_blk = (fioc/128) */
/* fioc_sz = 256 */
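// 128 tbp (threads per block for this instantiation: 32 patch tiles * 4 out-chan tiles; see LOC_ID_1D_sz in the substitution table below)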
// each thread: computes 8x8 block of out
// loop over k dim
// 1x1, stride-1, unpadded convolution followed by bias add and ReLU (max(0,x)).
//
// Buffer layouts, as indexed by the code below:
//   in     : img*200704 + in_chan*784 + y*28 + x
//   out    : img*25088  + out_chan*784 + (y*28 + x)
//   filts  : 32 consecutive floats per input channel (filts_off advances by 32 per
//            loop iteration); within one group of 32, slot (reg*4 + tile) is the
//            weight that is accumulated into output channel (tile*8 + reg)
//   biases : indexed directly by output channel (entries 0..31 are read)
//
// Work decomposition:
//   A "patch" is one output position; its linear index is img*784 + y*28 + x, and
//   valid patch indices are [0, 15680). Each work-group covers 256 consecutive
//   patches (GRP_ID_1D*256 ...). Inside a group, LOC_ID_1D%4 selects a tile of 8
//   output channels and LOC_ID_1D/4 selects a tile of 8 consecutive patches, so
//   every work-item accumulates an 8x8 (patch x out-chan) tile in out_tile[].
//   The indexing requires LOC_ID_1D/4 < 32 and two passes of LOC_SZ_1D work-items
//   to fill in_smem[256], i.e. a local size of 128 (LOC_ID_1D_sz in the substitution
//   table that follows this kernel; GRP_ID_1D_sz there is 62).
//
// Compile-time switches:
//   NO_IO  : replaces the global input gather, the per-iteration tile loads and the
//            output stores with dummy versions (presumably for timing experiments;
//            results are not meaningful in that mode).
//   NO_IOX : replaces the per-iteration filter load with a fixed-offset load.
CUCL_GLOBAL_KERNEL void conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_256( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out ) {
  LOCSHAR_MEM float in_smem[32*8]; // one input value per patch of this group, for the current input channel
  int32_t const blk_filt_ix_sz = 4*8;
  LOCSHAR_MEM float filts_smem[4*8]; // aka blk_filt_ix_sz, which wasn't const enough for OpenCL
  float out_tile[8*8] = {0}; // tile of output for this thread to compute, stored in registers
  // reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem
  float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
  float in_strip[8]; // across patches (approx square block in x/y space, favoring x if sqrt() not integer)
  int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*8192; // (x%1) is always 0: there is a single out-chan block
  int32_t const blk_patch_ix_sz = 32*8;
  int32_t const blk_patch_ix_base = GRP_ID_1D*blk_patch_ix_sz;
  // iterate over filter elements (here: one iteration per input channel, since the kernel is 1x1)
  int32_t filts_off = blk_filt_ix_base;
  for( int32_t filts_ix_out_chan_elem = 0; filts_ix_out_chan_elem !=
         (256 * 1 * 1); ++filts_ix_out_chan_elem ) {
    // make sure every work-item is done reading smem from the previous iteration before overwriting it
    BARRIER_SYNC;
    // (1a) stage the 32 filter values for this input channel into local memory
    if( LOC_ID_1D < blk_filt_ix_sz ) {
#ifdef NO_IOX // by default, we don't ever disable this, since it seems about as good as it can be already
      //filts_smem[LOC_ID_1D] = LOC_ID_1D;
      filts_smem[LOC_ID_1D] = filts[LOC_ID_1D];
#else
      filts_smem[LOC_ID_1D] = filts[filts_off + LOC_ID_1D];
#endif
    }
    // (1b) stage one input value per patch of this group into local memory
    for( int32_t i = 0; i != 2; ++i ) {
      if( (LOC_ID_1D+LOC_SZ_1D*i) < blk_patch_ix_sz ) {
        int32_t const t_smem_patch_ix = (blk_patch_ix_base+LOC_ID_1D+LOC_SZ_1D*i);
#ifdef NO_IO
        //float v = LOC_ID_1D;
        //float v = in[LOC_ID_1D];
        float v = in[filts_off + LOC_ID_1D];
#else
        // patches past the last image (t_smem_patch_ix/784 >= 20) keep v == 0
        float v = 0;
        int const smem_in_ix_y = ((t_smem_patch_ix/28)%28)*1+(filts_ix_out_chan_elem%1) - 0;
        int const smem_in_ix_x = (t_smem_patch_ix%28)*1+(filts_ix_out_chan_elem%1) - 0;
        if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 &&
           (t_smem_patch_ix/784) < 20 &&
           smem_in_ix_x < 28 && smem_in_ix_y < 28 ) {
          v = in[(t_smem_patch_ix/784)*200704 +
                 filts_ix_out_chan_elem*784 +
                 smem_in_ix_y*28 +
                 smem_in_ix_x*1];
        }
#endif
        in_smem[LOC_ID_1D+LOC_SZ_1D*i] = v;
      }
    }
    filts_off += 32;
    BARRIER_SYNC;
#ifdef NO_IO
    // begin t_tile_dummy_loads
    // filts_smem only has 32 elements, so the dummy index is wrapped to stay in bounds
    filts_strip[0] = filts_smem[(LOC_ID_1D + 0) % 32];
    filts_strip[1] = filts_smem[(LOC_ID_1D + 1) % 32];
    filts_strip[2] = filts_smem[(LOC_ID_1D + 2) % 32];
    filts_strip[3] = filts_smem[(LOC_ID_1D + 3) % 32];
    filts_strip[4] = filts_smem[(LOC_ID_1D + 4) % 32];
    filts_strip[5] = filts_smem[(LOC_ID_1D + 5) % 32];
    filts_strip[6] = filts_smem[(LOC_ID_1D + 6) % 32];
    filts_strip[7] = filts_smem[(LOC_ID_1D + 7) % 32];
    in_strip[0] = in_smem[(LOC_ID_1D % 32) + 0];
    in_strip[1] = in_smem[(LOC_ID_1D % 32) + 1];
    in_strip[2] = in_smem[(LOC_ID_1D % 32) + 2];
    in_strip[3] = in_smem[(LOC_ID_1D % 32) + 3];
    in_strip[4] = in_smem[(LOC_ID_1D % 32) + 4];
    in_strip[5] = in_smem[(LOC_ID_1D % 32) + 5];
    in_strip[6] = in_smem[(LOC_ID_1D % 32) + 6];
    in_strip[7] = in_smem[(LOC_ID_1D % 32) + 7];
    // end t_tile_dummy_loads;
#else
    // begin t_tile_loads
    // filts_strip[r]: weight for this work-item's out-chan tile, register r
    filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4];
    filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4];
    filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4];
    filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4];
    filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4];
    filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4];
    filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4];
    filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4];
    // in_strip[p]: input value for the p-th patch of this work-item's patch tile
    in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0];
    in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1];
    in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2];
    in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3];
    in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4];
    in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5];
    in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6];
    in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7];
    // end t_tile_loads;
#endif
    // (2) do 8^2 fmas into out_tile; out_tile[p*8 + r] pairs patch p with out-chan register r
    // begin t_tile_fmas
    out_tile[0] += filts_strip[0]*in_strip[0];
    out_tile[1] += filts_strip[1]*in_strip[0];
    out_tile[2] += filts_strip[2]*in_strip[0];
    out_tile[3] += filts_strip[3]*in_strip[0];
    out_tile[4] += filts_strip[4]*in_strip[0];
    out_tile[5] += filts_strip[5]*in_strip[0];
    out_tile[6] += filts_strip[6]*in_strip[0];
    out_tile[7] += filts_strip[7]*in_strip[0];
    out_tile[8] += filts_strip[0]*in_strip[1];
    out_tile[9] += filts_strip[1]*in_strip[1];
    out_tile[10] += filts_strip[2]*in_strip[1];
    out_tile[11] += filts_strip[3]*in_strip[1];
    out_tile[12] += filts_strip[4]*in_strip[1];
    out_tile[13] += filts_strip[5]*in_strip[1];
    out_tile[14] += filts_strip[6]*in_strip[1];
    out_tile[15] += filts_strip[7]*in_strip[1];
    out_tile[16] += filts_strip[0]*in_strip[2];
    out_tile[17] += filts_strip[1]*in_strip[2];
    out_tile[18] += filts_strip[2]*in_strip[2];
    out_tile[19] += filts_strip[3]*in_strip[2];
    out_tile[20] += filts_strip[4]*in_strip[2];
    out_tile[21] += filts_strip[5]*in_strip[2];
    out_tile[22] += filts_strip[6]*in_strip[2];
    out_tile[23] += filts_strip[7]*in_strip[2];
    out_tile[24] += filts_strip[0]*in_strip[3];
    out_tile[25] += filts_strip[1]*in_strip[3];
    out_tile[26] += filts_strip[2]*in_strip[3];
    out_tile[27] += filts_strip[3]*in_strip[3];
    out_tile[28] += filts_strip[4]*in_strip[3];
    out_tile[29] += filts_strip[5]*in_strip[3];
    out_tile[30] += filts_strip[6]*in_strip[3];
    out_tile[31] += filts_strip[7]*in_strip[3];
    out_tile[32] += filts_strip[0]*in_strip[4];
    out_tile[33] += filts_strip[1]*in_strip[4];
    out_tile[34] += filts_strip[2]*in_strip[4];
    out_tile[35] += filts_strip[3]*in_strip[4];
    out_tile[36] += filts_strip[4]*in_strip[4];
    out_tile[37] += filts_strip[5]*in_strip[4];
    out_tile[38] += filts_strip[6]*in_strip[4];
    out_tile[39] += filts_strip[7]*in_strip[4];
    out_tile[40] += filts_strip[0]*in_strip[5];
    out_tile[41] += filts_strip[1]*in_strip[5];
    out_tile[42] += filts_strip[2]*in_strip[5];
    out_tile[43] += filts_strip[3]*in_strip[5];
    out_tile[44] += filts_strip[4]*in_strip[5];
    out_tile[45] += filts_strip[5]*in_strip[5];
    out_tile[46] += filts_strip[6]*in_strip[5];
    out_tile[47] += filts_strip[7]*in_strip[5];
    out_tile[48] += filts_strip[0]*in_strip[6];
    out_tile[49] += filts_strip[1]*in_strip[6];
    out_tile[50] += filts_strip[2]*in_strip[6];
    out_tile[51] += filts_strip[3]*in_strip[6];
    out_tile[52] += filts_strip[4]*in_strip[6];
    out_tile[53] += filts_strip[5]*in_strip[6];
    out_tile[54] += filts_strip[6]*in_strip[6];
    out_tile[55] += filts_strip[7]*in_strip[6];
    out_tile[56] += filts_strip[0]*in_strip[7];
    out_tile[57] += filts_strip[1]*in_strip[7];
    out_tile[58] += filts_strip[2]*in_strip[7];
    out_tile[59] += filts_strip[3]*in_strip[7];
    out_tile[60] += filts_strip[4]*in_strip[7];
    out_tile[61] += filts_strip[5]*in_strip[7];
    out_tile[62] += filts_strip[6]*in_strip[7];
    out_tile[63] += filts_strip[7]*in_strip[7];
    // end t_tile_fmas;
  }
  // load per-block biases into smem (reusing filts_smem); the barrier keeps the last
  // iteration's filts_smem reads ahead of the overwrite
  BARRIER_SYNC;
  if( LOC_ID_1D < blk_filt_ix_sz ) {
    int32_t const ocix_base = (GRP_ID_1D%1)*blk_filt_ix_sz;
    int32_t const load_reg = LOC_ID_1D / 4;
    int32_t const load_tile = LOC_ID_1D % 4;
    // slot (reg*4 + tile) receives the bias of output channel (tile*8 + reg), matching the filter slot order
    int32_t const ocix = ocix_base + load_tile*8 + load_reg;
    if( ocix < 32 ) { filts_smem[LOC_ID_1D] = biases[ ocix ]; }
    //int32_t const ocix_tile = (ocix / 8) % 4;
    //int32_t const ocix_reg = ocix % 8;
    //filts_smem[ocix_tile * 1 + ocix_reg * 4] = biases[ocix];
  }
  BARRIER_SYNC;
  // load biases into filts_strip (in_strip is not needed past this point, so it is not reloaded)
  filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4];
  filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4];
  filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4];
  filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4];
  filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4];
  filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4];
  filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4];
  filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4];
  // add bias to each elem of out_tile[] and store the results to out[]
#ifdef NO_IO
  // begin t_tile_dummy_stores
  // every work-item writes its reduced tile to out[0]; this only keeps the computation live
  out[0] = 0.0f
    + max(0.0f,out_tile[0] + filts_strip[0])
    + max(0.0f,out_tile[1] + filts_strip[1])
    + max(0.0f,out_tile[2] + filts_strip[2])
    + max(0.0f,out_tile[3] + filts_strip[3])
    + max(0.0f,out_tile[4] + filts_strip[4])
    + max(0.0f,out_tile[5] + filts_strip[5])
    + max(0.0f,out_tile[6] + filts_strip[6])
    + max(0.0f,out_tile[7] + filts_strip[7])
    + max(0.0f,out_tile[8] + filts_strip[0])
    + max(0.0f,out_tile[9] + filts_strip[1])
    + max(0.0f,out_tile[10] + filts_strip[2])
    + max(0.0f,out_tile[11] + filts_strip[3])
    + max(0.0f,out_tile[12] + filts_strip[4])
    + max(0.0f,out_tile[13] + filts_strip[5])
    + max(0.0f,out_tile[14] + filts_strip[6])
    + max(0.0f,out_tile[15] + filts_strip[7])
    + max(0.0f,out_tile[16] + filts_strip[0])
    + max(0.0f,out_tile[17] + filts_strip[1])
    + max(0.0f,out_tile[18] + filts_strip[2])
    + max(0.0f,out_tile[19] + filts_strip[3])
    + max(0.0f,out_tile[20] + filts_strip[4])
    + max(0.0f,out_tile[21] + filts_strip[5])
    + max(0.0f,out_tile[22] + filts_strip[6])
    + max(0.0f,out_tile[23] + filts_strip[7])
    + max(0.0f,out_tile[24] + filts_strip[0])
    + max(0.0f,out_tile[25] + filts_strip[1])
    + max(0.0f,out_tile[26] + filts_strip[2])
    + max(0.0f,out_tile[27] + filts_strip[3])
    + max(0.0f,out_tile[28] + filts_strip[4])
    + max(0.0f,out_tile[29] + filts_strip[5])
    + max(0.0f,out_tile[30] + filts_strip[6])
    + max(0.0f,out_tile[31] + filts_strip[7])
    + max(0.0f,out_tile[32] + filts_strip[0])
    + max(0.0f,out_tile[33] + filts_strip[1])
    + max(0.0f,out_tile[34] + filts_strip[2])
    + max(0.0f,out_tile[35] + filts_strip[3])
    + max(0.0f,out_tile[36] + filts_strip[4])
    + max(0.0f,out_tile[37] + filts_strip[5])
    + max(0.0f,out_tile[38] + filts_strip[6])
    + max(0.0f,out_tile[39] + filts_strip[7])
    + max(0.0f,out_tile[40] + filts_strip[0])
    + max(0.0f,out_tile[41] + filts_strip[1])
    + max(0.0f,out_tile[42] + filts_strip[2])
    + max(0.0f,out_tile[43] + filts_strip[3])
    + max(0.0f,out_tile[44] + filts_strip[4])
    + max(0.0f,out_tile[45] + filts_strip[5])
    + max(0.0f,out_tile[46] + filts_strip[6])
    + max(0.0f,out_tile[47] + filts_strip[7])
    + max(0.0f,out_tile[48] + filts_strip[0])
    + max(0.0f,out_tile[49] + filts_strip[1])
    + max(0.0f,out_tile[50] + filts_strip[2])
    + max(0.0f,out_tile[51] + filts_strip[3])
    + max(0.0f,out_tile[52] + filts_strip[4])
    + max(0.0f,out_tile[53] + filts_strip[5])
    + max(0.0f,out_tile[54] + filts_strip[6])
    + max(0.0f,out_tile[55] + filts_strip[7])
    + max(0.0f,out_tile[56] + filts_strip[0])
    + max(0.0f,out_tile[57] + filts_strip[1])
    + max(0.0f,out_tile[58] + filts_strip[2])
    + max(0.0f,out_tile[59] + filts_strip[3])
    + max(0.0f,out_tile[60] + filts_strip[4])
    + max(0.0f,out_tile[61] + filts_strip[5])
    + max(0.0f,out_tile[62] + filts_strip[6])
    + max(0.0f,out_tile[63] + filts_strip[7])
    ;
  // end t_tile_dummy_stores;
#else
  // begin t_tile_stores
  // tpix[p]: offset in out[] of patch p (image offset + position inside the 28x28 plane)
  // tcix[r]: offset in out[] of output channel register r (channel * 784)
  int32_t tpix[8];
  int32_t tcix[8];
  tpix[0] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) % 784 ); // cache out patch ixs
  tpix[1] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) % 784 ); // cache out patch ixs
  tpix[2] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) % 784 ); // cache out patch ixs
  tpix[3] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) % 784 ); // cache out patch ixs
  tpix[4] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) % 784 ); // cache out patch ixs
  tpix[5] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) % 784 ); // cache out patch ixs
  tpix[6] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) % 784 ); // cache out patch ixs
  tpix[7] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7)/784)*25088 +
    ( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) % 784 ); // cache out patch ixs
  tcix[0] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+0)*784; // cache out chan ixs
  tcix[1] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+1)*784; // cache out chan ixs
  tcix[2] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+2)*784; // cache out chan ixs
  tcix[3] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+3)*784; // cache out chan ixs
  tcix[4] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+4)*784; // cache out chan ixs
  tcix[5] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+5)*784; // cache out chan ixs
  tcix[6] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+6)*784; // cache out chan ixs
  tcix[7] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+7)*784; // cache out chan ixs
  // note: the early returns below are safe w.r.t. barriers, since no BARRIER_SYNC follows them
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); }
  if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them.
  if( tcix[0] < (32*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); }
  if( tcix[1] < (32*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); }
  if( tcix[2] < (32*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); }
  if( tcix[3] < (32*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); }
  if( tcix[4] < (32*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); }
  if( tcix[5] < (32*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); }
  if( tcix[6] < (32*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); }
  if( tcix[7] < (32*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); }
  // end t_tile_stores;
#endif
}
// -- template substitution table used: --
/* num_imgs = 20 */
/* in_pad = 0 */
/* in_dim_0 = 28 */
/* in_dim_1 = 28 */
/* conv_has_relu = 1 */
/* kern_sz = 1 */
/* stride = 1 */
/* out_chans = 32 */
/* in_chans = 256 */
/* rtc_func_name = conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_256 */
/* t_tile_sz = 8 */
/* out_ix_x_dim = 28 */
/* out_ix_x_sz = 1 */
/* out_ix_x_nomod = out_ix */
/* out_ix_x = (out_ix%%28) */
/* out_ix_y_dim = 28 */
/* out_ix_y_sz = 28 */
/* out_ix_y_nomod = (out_ix/28) */
/* out_ix_y = ((out_ix/28)%%28) */
/* out_ix_chan_dim = 32 */
/* out_ix_chan_sz = 784 */
/* out_ix_chan_nomod = (out_ix/784) */
/* out_ix_chan = ((out_ix/784)%%32) */
/* out_ix_img_dim = 20 */
/* out_ix_img_sz = 25088 */
/* out_ix_img_nomod = (out_ix/25088) */
/* out_ix_img = (out_ix/25088) */
/* out_ix_sz = 501760 */
/* in_ix_x_dim = 28 */
/* in_ix_x_sz = 1 */
/* in_ix_x_nomod = in_ix */
/* in_ix_x = (in_ix%%28) */
/* in_ix_y_dim = 28 */
/* in_ix_y_sz = 28 */
/* in_ix_y_nomod = (in_ix/28) */
/* in_ix_y = ((in_ix/28)%%28) */
/* in_ix_chan_dim = 256 */
/* in_ix_chan_sz = 784 */
/* in_ix_chan_nomod = (in_ix/784) */
/* in_ix_chan = ((in_ix/784)%%256) */
/* in_ix_img_dim = 20 */
/* in_ix_img_sz = 200704 */
/* in_ix_img_nomod = (in_ix/200704) */
/* in_ix_img = (in_ix/200704) */
/* in_ix_sz = 4014080 */
/* t_smem_patch_ix_x_dim = 28 */
/* t_smem_patch_ix_x_sz = 1 */
/* t_smem_patch_ix_x_nomod = t_smem_patch_ix */
/* t_smem_patch_ix_x = (t_smem_patch_ix%%28) */
/* t_smem_patch_ix_y_dim = 28 */
/* t_smem_patch_ix_y_sz = 28 */
/* t_smem_patch_ix_y_nomod = (t_smem_patch_ix/28) */
/* t_smem_patch_ix_y = ((t_smem_patch_ix/28)%%28) */
/* t_smem_patch_ix_img_dim = 20 */
/* t_smem_patch_ix_img_sz = 784 */
/* t_smem_patch_ix_img_nomod = (t_smem_patch_ix/784) */
/* t_smem_patch_ix_img = (t_smem_patch_ix/784) */
/* t_smem_patch_ix_sz = 15680 */
/* filts_ix_out_chan_elem_x_dim = 1 */
/* filts_ix_out_chan_elem_x_sz = 1 */
/* filts_ix_out_chan_elem_x_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_x = (filts_ix_out_chan_elem%%1) */
/* filts_ix_out_chan_elem_y_dim = 1 */
/* filts_ix_out_chan_elem_y_sz = 1 */
/* filts_ix_out_chan_elem_y_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_y = (filts_ix_out_chan_elem%%1) */
/* filts_ix_out_chan_elem_in_chan_dim = 256 */
/* filts_ix_out_chan_elem_in_chan_sz = 1 */
/* filts_ix_out_chan_elem_in_chan_nomod = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_in_chan = filts_ix_out_chan_elem */
/* filts_ix_out_chan_elem_sz = 256 */
/* LOC_ID_1D_out_chan_tile_dim = 4 */
/* LOC_ID_1D_out_chan_tile_sz = 1 */
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%4) */
/* LOC_ID_1D_patch_tile_dim = 32 */
/* LOC_ID_1D_patch_tile_sz = 4 */
/* LOC_ID_1D_patch_tile_nomod = (LOC_ID_1D/4) */
/* LOC_ID_1D_patch_tile = (LOC_ID_1D/4) */
/* LOC_ID_1D_sz = 128 */
/* filts_xp_ix_out_chan_tile_dim = 4 */
/* filts_xp_ix_out_chan_tile_sz = 1 */
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */
/* filts_xp_ix_out_chan_reg_dim = 8 */
/* filts_xp_ix_out_chan_reg_sz = 4 */
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */
/* filts_xp_ix_x_dim = 1 */
/* filts_xp_ix_x_sz = 32 */
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_x = ((filts_xp_ix/32)%%1) */
/* filts_xp_ix_y_dim = 1 */
/* filts_xp_ix_y_sz = 32 */
/* filts_xp_ix_y_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_y = ((filts_xp_ix/32)%%1) */
/* filts_xp_ix_in_chan_dim = 256 */
/* filts_xp_ix_in_chan_sz = 32 */
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/32) */
/* filts_xp_ix_in_chan = ((filts_xp_ix/32)%%256) */
/* filts_xp_ix_out_chan_blk_dim = 1 */
/* filts_xp_ix_out_chan_blk_sz = 8192 */
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/8192) */
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/8192) */
/* filts_xp_ix_sz = 8192 */
/* patch_smem_load_iter = 2 */
/* GRP_ID_1D_out_chan_blk_dim = 1 */
/* GRP_ID_1D_out_chan_blk_sz = 1 */
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */
/* GRP_ID_1D_patch_blk_dim = 62 */
/* GRP_ID_1D_patch_blk_sz = 1 */
/* GRP_ID_1D_patch_blk_nomod = GRP_ID_1D */
/* GRP_ID_1D_patch_blk = GRP_ID_1D */
/* GRP_ID_1D_sz = 62 */
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */
/* patch_tile = (%(LOC_ID_1D_patch_tile)+%(GRP_ID_1D_patch_blk)*%(LOC_ID_1D_patch_tile_dim)) */
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */
/* patch_ix_0 = (%(patch_tile)*%(t_tile_sz)+0) */
/* patch_ix_0_x_dim = 28 */
/* patch_ix_0_x_sz = 1 */
/* patch_ix_0_x_nomod = %(patch_ix_0) */
/* patch_ix_0_x = (%(patch_ix_0)%%28) */
/* patch_ix_0_y_dim = 28 */
/* patch_ix_0_y_sz = 28 */
/* patch_ix_0_y_nomod = (%(patch_ix_0)/28) */
/* patch_ix_0_y = ((%(patch_ix_0)/28)%%28) */
/* patch_ix_0_img_dim = 20 */
/* patch_ix_0_img_sz = 784 */
/* patch_ix_0_img_nomod = (%(patch_ix_0)/784) */
/* patch_ix_0_img = (%(patch_ix_0)/784) */
/* patch_ix_0_sz = 15680 */
/* patch_ix_1 = (%(patch_tile)*%(t_tile_sz)+1) */
/* patch_ix_1_x_dim = 28 */
/* patch_ix_1_x_sz = 1 */
/* patch_ix_1_x_nomod = %(patch_ix_1) */
/* patch_ix_1_x = (%(patch_ix_1)%%28) */
/* patch_ix_1_y_dim = 28 */
/* patch_ix_1_y_sz = 28 */
/* patch_ix_1_y_nomod = (%(patch_ix_1)/28) */
/* patch_ix_1_y = ((%(patch_ix_1)/28)%%28) */
/* patch_ix_1_img_dim = 20 */
/* patch_ix_1_img_sz = 784 */
/* patch_ix_1_img_nomod = (%(patch_ix_1)/784) */
/* patch_ix_1_img = (%(patch_ix_1)/784) */
/* patch_ix_1_sz = 15680 */
/* patch_ix_2 = (%(patch_tile)*%(t_tile_sz)+2) */
/* patch_ix_2_x_dim = 28 */
/* patch_ix_2_x_sz = 1 */
/* patch_ix_2_x_nomod = %(patch_ix_2) */
/* patch_ix_2_x = (%(patch_ix_2)%%28) */
/* patch_ix_2_y_dim = 28 */
/* patch_ix_2_y_sz = 28 */
/* patch_ix_2_y_nomod = (%(patch_ix_2)/28) */
/* patch_ix_2_y = ((%(patch_ix_2)/28)%%28) */
/* patch_ix_2_img_dim = 20 */
/* patch_ix_2_img_sz = 784 */
/* patch_ix_2_img_nomod = (%(patch_ix_2)/784) */
/* patch_ix_2_img = (%(patch_ix_2)/784) */
/* patch_ix_2_sz = 15680 */
/* patch_ix_3 = (%(patch_tile)*%(t_tile_sz)+3) */
/* patch_ix_3_x_dim = 28 */
/* patch_ix_3_x_sz = 1 */
/* patch_ix_3_x_nomod = %(patch_ix_3) */
/* patch_ix_3_x = (%(patch_ix_3)%%28) */
/* patch_ix_3_y_dim = 28 */
/* patch_ix_3_y_sz = 28 */
/* patch_ix_3_y_nomod = (%(patch_ix_3)/28) */
/* patch_ix_3_y = ((%(patch_ix_3)/28)%%28) */
/* patch_ix_3_img_dim = 20 */
/* patch_ix_3_img_sz = 784 */
/* patch_ix_3_img_nomod = (%(patch_ix_3)/784) */
/* patch_ix_3_img = (%(patch_ix_3)/784) */
/* patch_ix_3_sz = 15680 */
/* patch_ix_4 = (%(patch_tile)*%(t_tile_sz)+4) */
/* patch_ix_4_x_dim = 28 */
/* patch_ix_4_x_sz = 1 */
/* patch_ix_4_x_nomod = %(patch_ix_4) */
/* patch_ix_4_x = (%(patch_ix_4)%%28) */
/* patch_ix_4_y_dim = 28 */
/* patch_ix_4_y_sz = 28 */
/* patch_ix_4_y_nomod = (%(patch_ix_4)/28) */
/* patch_ix_4_y = ((%(patch_ix_4)/28)%%28) */
/* patch_ix_4_img_dim = 20 */
/* patch_ix_4_img_sz = 784 */
/* patch_ix_4_img_nomod = (%(patch_ix_4)/784) */
/* patch_ix_4_img = (%(patch_ix_4)/784) */
/* patch_ix_4_sz = 15680 */
/* patch_ix_5 = (%(patch_tile)*%(t_tile_sz)+5) */
/* patch_ix_5_x_dim = 28 */
/* patch_ix_5_x_sz = 1 */
/* patch_ix_5_x_nomod = %(patch_ix_5) */
/* patch_ix_5_x = (%(patch_ix_5)%%28) */
/* patch_ix_5_y_dim = 28 */
/* patch_ix_5_y_sz = 28 */
/* patch_ix_5_y_nomod = (%(patch_ix_5)/28) */
/* patch_ix_5_y = ((%(patch_ix_5)/28)%%28) */
/* patch_ix_5_img_dim = 20 */
/* patch_ix_5_img_sz = 784 */
/* patch_ix_5_img_nomod = (%(patch_ix_5)/784) */
/* patch_ix_5_img = (%(patch_ix_5)/784) */
/* patch_ix_5_sz = 15680 */
/* patch_ix_6 = (%(patch_tile)*%(t_tile_sz)+6) */
/* patch_ix_6_x_dim = 28 */
/* patch_ix_6_x_sz = 1 */
/* patch_ix_6_x_nomod = %(patch_ix_6) */
/* patch_ix_6_x = (%(patch_ix_6)%%28) */
/* patch_ix_6_y_dim = 28 */
/* patch_ix_6_y_sz = 28 */
/* patch_ix_6_y_nomod = (%(patch_ix_6)/28) */
/* patch_ix_6_y = ((%(patch_ix_6)/28)%%28) */
/* patch_ix_6_img_dim = 20 */
/* patch_ix_6_img_sz = 784 */
/* patch_ix_6_img_nomod = (%(patch_ix_6)/784) */
/* patch_ix_6_img = (%(patch_ix_6)/784) */
/* patch_ix_6_sz = 15680 */
/* patch_ix_7 = (%(patch_tile)*%(t_tile_sz)+7) */
/* patch_ix_7_x_dim = 28 */
/* patch_ix_7_x_sz = 1 */
/* patch_ix_7_x_nomod = %(patch_ix_7) */
/* patch_ix_7_x = (%(patch_ix_7)%%28) */
/* patch_ix_7_y_dim = 28 */
/* patch_ix_7_y_sz = 28 */
/* patch_ix_7_y_nomod = (%(patch_ix_7)/28) */
/* patch_ix_7_y = ((%(patch_ix_7)/28)%%28) */
/* patch_ix_7_img_dim = 20 */
/* patch_ix_7_img_sz = 784 */
/* patch_ix_7_img_nomod = (%(patch_ix_7)/784) */
/* patch_ix_7_img = (%(patch_ix_7)/784) */
/* patch_ix_7_sz = 15680 */
/* get_in = float v = 0;
int const smem_in_ix_y = %(t_smem_patch_ix_y)*%(stride)+%(filts_ix_out_chan_elem_y) - %(in_pad);
int const smem_in_ix_x = %(t_smem_patch_ix_x)*%(stride)+%(filts_ix_out_chan_elem_x) - %(in_pad);
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 &&
%(t_smem_patch_ix_img) < %(in_ix_img_dim) &&
smem_in_ix_x < %(in_ix_x_dim) && smem_in_ix_y < %(in_ix_y_dim) ) {
v = in[%(t_smem_patch_ix_img)*%(in_ix_img_sz) +
%(filts_ix_out_chan_elem_in_chan)*%(in_ix_chan_sz) +
smem_in_ix_y*%(in_ix_y_sz) +
smem_in_ix_x*%(in_ix_x_sz)];
} */
/* t_tile_fmas = // begin t_tile_fmas
out_tile[0] += filts_strip[0]*in_strip[0];
out_tile[1] += filts_strip[1]*in_strip[0];
out_tile[2] += filts_strip[2]*in_strip[0];
out_tile[3] += filts_strip[3]*in_strip[0];
out_tile[4] += filts_strip[4]*in_strip[0];
out_tile[5] += filts_strip[5]*in_strip[0];
out_tile[6] += filts_strip[6]*in_strip[0];
out_tile[7] += filts_strip[7]*in_strip[0];
out_tile[8] += filts_strip[0]*in_strip[1];
out_tile[9] += filts_strip[1]*in_strip[1];
out_tile[10] += filts_strip[2]*in_strip[1];
out_tile[11] += filts_strip[3]*in_strip[1];
out_tile[12] += filts_strip[4]*in_strip[1];
out_tile[13] += filts_strip[5]*in_strip[1];
out_tile[14] += filts_strip[6]*in_strip[1];
out_tile[15] += filts_strip[7]*in_strip[1];
out_tile[16] += filts_strip[0]*in_strip[2];
out_tile[17] += filts_strip[1]*in_strip[2];
out_tile[18] += filts_strip[2]*in_strip[2];
out_tile[19] += filts_strip[3]*in_strip[2];
out_tile[20] += filts_strip[4]*in_strip[2];
out_tile[21] += filts_strip[5]*in_strip[2];
out_tile[22] += filts_strip[6]*in_strip[2];
out_tile[23] += filts_strip[7]*in_strip[2];
out_tile[24] += filts_strip[0]*in_strip[3];
out_tile[25]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment