Last active
October 14, 2015 21:21
-
-
Save moskewcz/20b4b19818622c3d3904 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compatibility preamble: maps the generic names used by the kernel code in
// this file onto OpenCL C qualifiers and work-item built-in functions.
//typedef unsigned uint32_t;
typedef int int32_t;
//typedef long long int64_t;
// Kernel entry-point qualifier.
#define CUCL_GLOBAL_KERNEL kernel
// Address-space qualifier for global-memory pointers.
#define GASQ global
// 1D work-item / work-group indexing (dimension 0 only).
#define GLOB_ID_1D get_global_id(0)
#define LOC_ID_1D get_local_id(0)
#define GRP_ID_1D get_group_id(0)
#define LOC_SZ_1D get_local_size(0)
// Qualifier for declaring work-group local arrays.
#define LOCSHAR_MEM local
// Address-space qualifier for pointers into local memory.
#define LSMASQ local
// Work-group barrier with a local-memory fence.
#define BARRIER_SYNC barrier(CLK_LOCAL_MEM_FENCE)
// note: it seems OpenCL doesn't provide powf(), but instead overloads pow() for double and float.
// so, we use this as a compatibility wrapper.
// the casts should help uses that might expect implicit casts from double->float when using powf()
// ... or maybe that's a bad idea?
// Each argument is parenthesized before the cast so that the cast applies to the
// value of the whole argument expression (e.g. powf(a/b, e) casts the quotient,
// not just 'a'), matching what a real powf() function call would receive.
#define powf(v,e) pow((float)(v),(float)(e))
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_227__in_dim_1_227__kern_sz_7__stride_2__in_pad_3__t_tile_sz_8__conv_has_relu_1__out_chans_64__in_chans_3( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
LOCSHAR_MEM float all_smem[1330]; // note: max(filts+in,out) == max(448+882,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 448; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[21]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t blk_in_ix_base = GRP_ID_1D*2646 + LOC_ID_1D;// index of first input pel to load for this thread | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*9408; // index of first out chan | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%8); | |
int32_t out_line = (GRP_ID_1D/15)*16; // first out_line of block | |
int32_t const blk_fli = (out_line/114); // image of first out_line of block | |
out_line += (LOC_ID_1D/8); // adjust to out_line of this thread | |
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img) | |
int32_t const img_off_lines = ((out_line/114) - blk_fli)*(7-2); | |
int32_t const in_y = (out_line%114)*2 - 3; | |
for( int32_t in_chan = 0; in_chan != 3; ++in_chan ) { | |
BARRIER_SYNC; | |
// begin in_smem_loads | |
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ]; | |
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ]; | |
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ]; | |
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ]; | |
in_smem[(LOC_ID_1D + 128 * 4)] = in[ blk_in_ix_base + (128*4) ]; | |
in_smem[(LOC_ID_1D + 128 * 5)] = in[ blk_in_ix_base + (128*5) ]; | |
if( (LOC_ID_1D + 128 * 6) < 882) { in_smem[(LOC_ID_1D + 128 * 6)] = in[ blk_in_ix_base + (128*6) ];} | |
blk_in_ix_base += 882; | |
// end in_smem_loads; | |
for( int32_t ky = 0; ky != 7; ++ky ) { | |
if( ky != 0 ) { BARRIER_SYNC; } | |
// begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)]; | |
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)]; | |
if( (LOC_ID_1D + 128 * 3) < 448 ) { filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)];} | |
filts_off += 448; | |
// end filt_smem_loads; | |
BARRIER_SYNC; | |
if( (out_line/114) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid) | |
if( ((in_y+ky) < 0) || ((in_y+ky)>227) ) { continue; } // optimization: skip known-to-be-padding input lines | |
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/8)*2+ky+img_off_lines)*21; | |
// begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
in_strip[10] = in_smem_off[10]; | |
in_strip[11] = in_smem_off[11]; | |
in_strip[12] = in_smem_off[12]; | |
in_strip[13] = in_smem_off[13]; | |
in_strip[14] = in_smem_off[14]; | |
in_strip[15] = in_smem_off[15]; | |
in_strip[16] = in_smem_off[16]; | |
in_strip[17] = in_smem_off[17]; | |
in_strip[18] = in_smem_off[18]; | |
in_strip[19] = in_smem_off[19]; | |
in_strip[20] = in_smem_off[20]; | |
filts_strip[0] = filts_smem_off[0*64+0*8]; | |
filts_strip[1] = filts_smem_off[0*64+1*8]; | |
filts_strip[2] = filts_smem_off[0*64+2*8]; | |
filts_strip[3] = filts_smem_off[0*64+3*8]; | |
filts_strip[4] = filts_smem_off[0*64+4*8]; | |
filts_strip[5] = filts_smem_off[0*64+5*8]; | |
filts_strip[6] = filts_smem_off[0*64+6*8]; | |
filts_strip[7] = filts_smem_off[0*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[6]; | |
out_tile[25] += filts_strip[1]*in_strip[6]; | |
out_tile[26] += filts_strip[2]*in_strip[6]; | |
out_tile[27] += filts_strip[3]*in_strip[6]; | |
out_tile[28] += filts_strip[4]*in_strip[6]; | |
out_tile[29] += filts_strip[5]*in_strip[6]; | |
out_tile[30] += filts_strip[6]*in_strip[6]; | |
out_tile[31] += filts_strip[7]*in_strip[6]; | |
out_tile[32] += filts_strip[0]*in_strip[8]; | |
out_tile[33] += filts_strip[1]*in_strip[8]; | |
out_tile[34] += filts_strip[2]*in_strip[8]; | |
out_tile[35] += filts_strip[3]*in_strip[8]; | |
out_tile[36] += filts_strip[4]*in_strip[8]; | |
out_tile[37] += filts_strip[5]*in_strip[8]; | |
out_tile[38] += filts_strip[6]*in_strip[8]; | |
out_tile[39] += filts_strip[7]*in_strip[8]; | |
out_tile[40] += filts_strip[0]*in_strip[10]; | |
out_tile[41] += filts_strip[1]*in_strip[10]; | |
out_tile[42] += filts_strip[2]*in_strip[10]; | |
out_tile[43] += filts_strip[3]*in_strip[10]; | |
out_tile[44] += filts_strip[4]*in_strip[10]; | |
out_tile[45] += filts_strip[5]*in_strip[10]; | |
out_tile[46] += filts_strip[6]*in_strip[10]; | |
out_tile[47] += filts_strip[7]*in_strip[10]; | |
out_tile[48] += filts_strip[0]*in_strip[12]; | |
out_tile[49] += filts_strip[1]*in_strip[12]; | |
out_tile[50] += filts_strip[2]*in_strip[12]; | |
out_tile[51] += filts_strip[3]*in_strip[12]; | |
out_tile[52] += filts_strip[4]*in_strip[12]; | |
out_tile[53] += filts_strip[5]*in_strip[12]; | |
out_tile[54] += filts_strip[6]*in_strip[12]; | |
out_tile[55] += filts_strip[7]*in_strip[12]; | |
out_tile[56] += filts_strip[0]*in_strip[14]; | |
out_tile[57] += filts_strip[1]*in_strip[14]; | |
out_tile[58] += filts_strip[2]*in_strip[14]; | |
out_tile[59] += filts_strip[3]*in_strip[14]; | |
out_tile[60] += filts_strip[4]*in_strip[14]; | |
out_tile[61] += filts_strip[5]*in_strip[14]; | |
out_tile[62] += filts_strip[6]*in_strip[14]; | |
out_tile[63] += filts_strip[7]*in_strip[14]; | |
filts_strip[0] = filts_smem_off[1*64+0*8]; | |
filts_strip[1] = filts_smem_off[1*64+1*8]; | |
filts_strip[2] = filts_smem_off[1*64+2*8]; | |
filts_strip[3] = filts_smem_off[1*64+3*8]; | |
filts_strip[4] = filts_smem_off[1*64+4*8]; | |
filts_strip[5] = filts_smem_off[1*64+5*8]; | |
filts_strip[6] = filts_smem_off[1*64+6*8]; | |
filts_strip[7] = filts_smem_off[1*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[5]; | |
out_tile[17] += filts_strip[1]*in_strip[5]; | |
out_tile[18] += filts_strip[2]*in_strip[5]; | |
out_tile[19] += filts_strip[3]*in_strip[5]; | |
out_tile[20] += filts_strip[4]*in_strip[5]; | |
out_tile[21] += filts_strip[5]*in_strip[5]; | |
out_tile[22] += filts_strip[6]*in_strip[5]; | |
out_tile[23] += filts_strip[7]*in_strip[5]; | |
out_tile[24] += filts_strip[0]*in_strip[7]; | |
out_tile[25] += filts_strip[1]*in_strip[7]; | |
out_tile[26] += filts_strip[2]*in_strip[7]; | |
out_tile[27] += filts_strip[3]*in_strip[7]; | |
out_tile[28] += filts_strip[4]*in_strip[7]; | |
out_tile[29] += filts_strip[5]*in_strip[7]; | |
out_tile[30] += filts_strip[6]*in_strip[7]; | |
out_tile[31] += filts_strip[7]*in_strip[7]; | |
out_tile[32] += filts_strip[0]*in_strip[9]; | |
out_tile[33] += filts_strip[1]*in_strip[9]; | |
out_tile[34] += filts_strip[2]*in_strip[9]; | |
out_tile[35] += filts_strip[3]*in_strip[9]; | |
out_tile[36] += filts_strip[4]*in_strip[9]; | |
out_tile[37] += filts_strip[5]*in_strip[9]; | |
out_tile[38] += filts_strip[6]*in_strip[9]; | |
out_tile[39] += filts_strip[7]*in_strip[9]; | |
out_tile[40] += filts_strip[0]*in_strip[11]; | |
out_tile[41] += filts_strip[1]*in_strip[11]; | |
out_tile[42] += filts_strip[2]*in_strip[11]; | |
out_tile[43] += filts_strip[3]*in_strip[11]; | |
out_tile[44] += filts_strip[4]*in_strip[11]; | |
out_tile[45] += filts_strip[5]*in_strip[11]; | |
out_tile[46] += filts_strip[6]*in_strip[11]; | |
out_tile[47] += filts_strip[7]*in_strip[11]; | |
out_tile[48] += filts_strip[0]*in_strip[13]; | |
out_tile[49] += filts_strip[1]*in_strip[13]; | |
out_tile[50] += filts_strip[2]*in_strip[13]; | |
out_tile[51] += filts_strip[3]*in_strip[13]; | |
out_tile[52] += filts_strip[4]*in_strip[13]; | |
out_tile[53] += filts_strip[5]*in_strip[13]; | |
out_tile[54] += filts_strip[6]*in_strip[13]; | |
out_tile[55] += filts_strip[7]*in_strip[13]; | |
out_tile[56] += filts_strip[0]*in_strip[15]; | |
out_tile[57] += filts_strip[1]*in_strip[15]; | |
out_tile[58] += filts_strip[2]*in_strip[15]; | |
out_tile[59] += filts_strip[3]*in_strip[15]; | |
out_tile[60] += filts_strip[4]*in_strip[15]; | |
out_tile[61] += filts_strip[5]*in_strip[15]; | |
out_tile[62] += filts_strip[6]*in_strip[15]; | |
out_tile[63] += filts_strip[7]*in_strip[15]; | |
filts_strip[0] = filts_smem_off[2*64+0*8]; | |
filts_strip[1] = filts_smem_off[2*64+1*8]; | |
filts_strip[2] = filts_smem_off[2*64+2*8]; | |
filts_strip[3] = filts_smem_off[2*64+3*8]; | |
filts_strip[4] = filts_smem_off[2*64+4*8]; | |
filts_strip[5] = filts_smem_off[2*64+5*8]; | |
filts_strip[6] = filts_smem_off[2*64+6*8]; | |
filts_strip[7] = filts_smem_off[2*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[4]; | |
out_tile[9] += filts_strip[1]*in_strip[4]; | |
out_tile[10] += filts_strip[2]*in_strip[4]; | |
out_tile[11] += filts_strip[3]*in_strip[4]; | |
out_tile[12] += filts_strip[4]*in_strip[4]; | |
out_tile[13] += filts_strip[5]*in_strip[4]; | |
out_tile[14] += filts_strip[6]*in_strip[4]; | |
out_tile[15] += filts_strip[7]*in_strip[4]; | |
out_tile[16] += filts_strip[0]*in_strip[6]; | |
out_tile[17] += filts_strip[1]*in_strip[6]; | |
out_tile[18] += filts_strip[2]*in_strip[6]; | |
out_tile[19] += filts_strip[3]*in_strip[6]; | |
out_tile[20] += filts_strip[4]*in_strip[6]; | |
out_tile[21] += filts_strip[5]*in_strip[6]; | |
out_tile[22] += filts_strip[6]*in_strip[6]; | |
out_tile[23] += filts_strip[7]*in_strip[6]; | |
out_tile[24] += filts_strip[0]*in_strip[8]; | |
out_tile[25] += filts_strip[1]*in_strip[8]; | |
out_tile[26] += filts_strip[2]*in_strip[8]; | |
out_tile[27] += filts_strip[3]*in_strip[8]; | |
out_tile[28] += filts_strip[4]*in_strip[8]; | |
out_tile[29] += filts_strip[5]*in_strip[8]; | |
out_tile[30] += filts_strip[6]*in_strip[8]; | |
out_tile[31] += filts_strip[7]*in_strip[8]; | |
out_tile[32] += filts_strip[0]*in_strip[10]; | |
out_tile[33] += filts_strip[1]*in_strip[10]; | |
out_tile[34] += filts_strip[2]*in_strip[10]; | |
out_tile[35] += filts_strip[3]*in_strip[10]; | |
out_tile[36] += filts_strip[4]*in_strip[10]; | |
out_tile[37] += filts_strip[5]*in_strip[10]; | |
out_tile[38] += filts_strip[6]*in_strip[10]; | |
out_tile[39] += filts_strip[7]*in_strip[10]; | |
out_tile[40] += filts_strip[0]*in_strip[12]; | |
out_tile[41] += filts_strip[1]*in_strip[12]; | |
out_tile[42] += filts_strip[2]*in_strip[12]; | |
out_tile[43] += filts_strip[3]*in_strip[12]; | |
out_tile[44] += filts_strip[4]*in_strip[12]; | |
out_tile[45] += filts_strip[5]*in_strip[12]; | |
out_tile[46] += filts_strip[6]*in_strip[12]; | |
out_tile[47] += filts_strip[7]*in_strip[12]; | |
out_tile[48] += filts_strip[0]*in_strip[14]; | |
out_tile[49] += filts_strip[1]*in_strip[14]; | |
out_tile[50] += filts_strip[2]*in_strip[14]; | |
out_tile[51] += filts_strip[3]*in_strip[14]; | |
out_tile[52] += filts_strip[4]*in_strip[14]; | |
out_tile[53] += filts_strip[5]*in_strip[14]; | |
out_tile[54] += filts_strip[6]*in_strip[14]; | |
out_tile[55] += filts_strip[7]*in_strip[14]; | |
out_tile[56] += filts_strip[0]*in_strip[16]; | |
out_tile[57] += filts_strip[1]*in_strip[16]; | |
out_tile[58] += filts_strip[2]*in_strip[16]; | |
out_tile[59] += filts_strip[3]*in_strip[16]; | |
out_tile[60] += filts_strip[4]*in_strip[16]; | |
out_tile[61] += filts_strip[5]*in_strip[16]; | |
out_tile[62] += filts_strip[6]*in_strip[16]; | |
out_tile[63] += filts_strip[7]*in_strip[16]; | |
filts_strip[0] = filts_smem_off[3*64+0*8]; | |
filts_strip[1] = filts_smem_off[3*64+1*8]; | |
filts_strip[2] = filts_smem_off[3*64+2*8]; | |
filts_strip[3] = filts_smem_off[3*64+3*8]; | |
filts_strip[4] = filts_smem_off[3*64+4*8]; | |
filts_strip[5] = filts_smem_off[3*64+5*8]; | |
filts_strip[6] = filts_smem_off[3*64+6*8]; | |
filts_strip[7] = filts_smem_off[3*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[3]; | |
out_tile[1] += filts_strip[1]*in_strip[3]; | |
out_tile[2] += filts_strip[2]*in_strip[3]; | |
out_tile[3] += filts_strip[3]*in_strip[3]; | |
out_tile[4] += filts_strip[4]*in_strip[3]; | |
out_tile[5] += filts_strip[5]*in_strip[3]; | |
out_tile[6] += filts_strip[6]*in_strip[3]; | |
out_tile[7] += filts_strip[7]*in_strip[3]; | |
out_tile[8] += filts_strip[0]*in_strip[5]; | |
out_tile[9] += filts_strip[1]*in_strip[5]; | |
out_tile[10] += filts_strip[2]*in_strip[5]; | |
out_tile[11] += filts_strip[3]*in_strip[5]; | |
out_tile[12] += filts_strip[4]*in_strip[5]; | |
out_tile[13] += filts_strip[5]*in_strip[5]; | |
out_tile[14] += filts_strip[6]*in_strip[5]; | |
out_tile[15] += filts_strip[7]*in_strip[5]; | |
out_tile[16] += filts_strip[0]*in_strip[7]; | |
out_tile[17] += filts_strip[1]*in_strip[7]; | |
out_tile[18] += filts_strip[2]*in_strip[7]; | |
out_tile[19] += filts_strip[3]*in_strip[7]; | |
out_tile[20] += filts_strip[4]*in_strip[7]; | |
out_tile[21] += filts_strip[5]*in_strip[7]; | |
out_tile[22] += filts_strip[6]*in_strip[7]; | |
out_tile[23] += filts_strip[7]*in_strip[7]; | |
out_tile[24] += filts_strip[0]*in_strip[9]; | |
out_tile[25] += filts_strip[1]*in_strip[9]; | |
out_tile[26] += filts_strip[2]*in_strip[9]; | |
out_tile[27] += filts_strip[3]*in_strip[9]; | |
out_tile[28] += filts_strip[4]*in_strip[9]; | |
out_tile[29] += filts_strip[5]*in_strip[9]; | |
out_tile[30] += filts_strip[6]*in_strip[9]; | |
out_tile[31] += filts_strip[7]*in_strip[9]; | |
out_tile[32] += filts_strip[0]*in_strip[11]; | |
out_tile[33] += filts_strip[1]*in_strip[11]; | |
out_tile[34] += filts_strip[2]*in_strip[11]; | |
out_tile[35] += filts_strip[3]*in_strip[11]; | |
out_tile[36] += filts_strip[4]*in_strip[11]; | |
out_tile[37] += filts_strip[5]*in_strip[11]; | |
out_tile[38] += filts_strip[6]*in_strip[11]; | |
out_tile[39] += filts_strip[7]*in_strip[11]; | |
out_tile[40] += filts_strip[0]*in_strip[13]; | |
out_tile[41] += filts_strip[1]*in_strip[13]; | |
out_tile[42] += filts_strip[2]*in_strip[13]; | |
out_tile[43] += filts_strip[3]*in_strip[13]; | |
out_tile[44] += filts_strip[4]*in_strip[13]; | |
out_tile[45] += filts_strip[5]*in_strip[13]; | |
out_tile[46] += filts_strip[6]*in_strip[13]; | |
out_tile[47] += filts_strip[7]*in_strip[13]; | |
out_tile[48] += filts_strip[0]*in_strip[15]; | |
out_tile[49] += filts_strip[1]*in_strip[15]; | |
out_tile[50] += filts_strip[2]*in_strip[15]; | |
out_tile[51] += filts_strip[3]*in_strip[15]; | |
out_tile[52] += filts_strip[4]*in_strip[15]; | |
out_tile[53] += filts_strip[5]*in_strip[15]; | |
out_tile[54] += filts_strip[6]*in_strip[15]; | |
out_tile[55] += filts_strip[7]*in_strip[15]; | |
out_tile[56] += filts_strip[0]*in_strip[17]; | |
out_tile[57] += filts_strip[1]*in_strip[17]; | |
out_tile[58] += filts_strip[2]*in_strip[17]; | |
out_tile[59] += filts_strip[3]*in_strip[17]; | |
out_tile[60] += filts_strip[4]*in_strip[17]; | |
out_tile[61] += filts_strip[5]*in_strip[17]; | |
out_tile[62] += filts_strip[6]*in_strip[17]; | |
out_tile[63] += filts_strip[7]*in_strip[17]; | |
filts_strip[0] = filts_smem_off[4*64+0*8]; | |
filts_strip[1] = filts_smem_off[4*64+1*8]; | |
filts_strip[2] = filts_smem_off[4*64+2*8]; | |
filts_strip[3] = filts_smem_off[4*64+3*8]; | |
filts_strip[4] = filts_smem_off[4*64+4*8]; | |
filts_strip[5] = filts_smem_off[4*64+5*8]; | |
filts_strip[6] = filts_smem_off[4*64+6*8]; | |
filts_strip[7] = filts_smem_off[4*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[4]; | |
out_tile[1] += filts_strip[1]*in_strip[4]; | |
out_tile[2] += filts_strip[2]*in_strip[4]; | |
out_tile[3] += filts_strip[3]*in_strip[4]; | |
out_tile[4] += filts_strip[4]*in_strip[4]; | |
out_tile[5] += filts_strip[5]*in_strip[4]; | |
out_tile[6] += filts_strip[6]*in_strip[4]; | |
out_tile[7] += filts_strip[7]*in_strip[4]; | |
out_tile[8] += filts_strip[0]*in_strip[6]; | |
out_tile[9] += filts_strip[1]*in_strip[6]; | |
out_tile[10] += filts_strip[2]*in_strip[6]; | |
out_tile[11] += filts_strip[3]*in_strip[6]; | |
out_tile[12] += filts_strip[4]*in_strip[6]; | |
out_tile[13] += filts_strip[5]*in_strip[6]; | |
out_tile[14] += filts_strip[6]*in_strip[6]; | |
out_tile[15] += filts_strip[7]*in_strip[6]; | |
out_tile[16] += filts_strip[0]*in_strip[8]; | |
out_tile[17] += filts_strip[1]*in_strip[8]; | |
out_tile[18] += filts_strip[2]*in_strip[8]; | |
out_tile[19] += filts_strip[3]*in_strip[8]; | |
out_tile[20] += filts_strip[4]*in_strip[8]; | |
out_tile[21] += filts_strip[5]*in_strip[8]; | |
out_tile[22] += filts_strip[6]*in_strip[8]; | |
out_tile[23] += filts_strip[7]*in_strip[8]; | |
out_tile[24] += filts_strip[0]*in_strip[10]; | |
out_tile[25] += filts_strip[1]*in_strip[10]; | |
out_tile[26] += filts_strip[2]*in_strip[10]; | |
out_tile[27] += filts_strip[3]*in_strip[10]; | |
out_tile[28] += filts_strip[4]*in_strip[10]; | |
out_tile[29] += filts_strip[5]*in_strip[10]; | |
out_tile[30] += filts_strip[6]*in_strip[10]; | |
out_tile[31] += filts_strip[7]*in_strip[10]; | |
out_tile[32] += filts_strip[0]*in_strip[12]; | |
out_tile[33] += filts_strip[1]*in_strip[12]; | |
out_tile[34] += filts_strip[2]*in_strip[12]; | |
out_tile[35] += filts_strip[3]*in_strip[12]; | |
out_tile[36] += filts_strip[4]*in_strip[12]; | |
out_tile[37] += filts_strip[5]*in_strip[12]; | |
out_tile[38] += filts_strip[6]*in_strip[12]; | |
out_tile[39] += filts_strip[7]*in_strip[12]; | |
out_tile[40] += filts_strip[0]*in_strip[14]; | |
out_tile[41] += filts_strip[1]*in_strip[14]; | |
out_tile[42] += filts_strip[2]*in_strip[14]; | |
out_tile[43] += filts_strip[3]*in_strip[14]; | |
out_tile[44] += filts_strip[4]*in_strip[14]; | |
out_tile[45] += filts_strip[5]*in_strip[14]; | |
out_tile[46] += filts_strip[6]*in_strip[14]; | |
out_tile[47] += filts_strip[7]*in_strip[14]; | |
out_tile[48] += filts_strip[0]*in_strip[16]; | |
out_tile[49] += filts_strip[1]*in_strip[16]; | |
out_tile[50] += filts_strip[2]*in_strip[16]; | |
out_tile[51] += filts_strip[3]*in_strip[16]; | |
out_tile[52] += filts_strip[4]*in_strip[16]; | |
out_tile[53] += filts_strip[5]*in_strip[16]; | |
out_tile[54] += filts_strip[6]*in_strip[16]; | |
out_tile[55] += filts_strip[7]*in_strip[16]; | |
out_tile[56] += filts_strip[0]*in_strip[18]; | |
out_tile[57] += filts_strip[1]*in_strip[18]; | |
out_tile[58] += filts_strip[2]*in_strip[18]; | |
out_tile[59] += filts_strip[3]*in_strip[18]; | |
out_tile[60] += filts_strip[4]*in_strip[18]; | |
out_tile[61] += filts_strip[5]*in_strip[18]; | |
out_tile[62] += filts_strip[6]*in_strip[18]; | |
out_tile[63] += filts_strip[7]*in_strip[18]; | |
filts_strip[0] = filts_smem_off[5*64+0*8]; | |
filts_strip[1] = filts_smem_off[5*64+1*8]; | |
filts_strip[2] = filts_smem_off[5*64+2*8]; | |
filts_strip[3] = filts_smem_off[5*64+3*8]; | |
filts_strip[4] = filts_smem_off[5*64+4*8]; | |
filts_strip[5] = filts_smem_off[5*64+5*8]; | |
filts_strip[6] = filts_smem_off[5*64+6*8]; | |
filts_strip[7] = filts_smem_off[5*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[5]; | |
out_tile[1] += filts_strip[1]*in_strip[5]; | |
out_tile[2] += filts_strip[2]*in_strip[5]; | |
out_tile[3] += filts_strip[3]*in_strip[5]; | |
out_tile[4] += filts_strip[4]*in_strip[5]; | |
out_tile[5] += filts_strip[5]*in_strip[5]; | |
out_tile[6] += filts_strip[6]*in_strip[5]; | |
out_tile[7] += filts_strip[7]*in_strip[5]; | |
out_tile[8] += filts_strip[0]*in_strip[7]; | |
out_tile[9] += filts_strip[1]*in_strip[7]; | |
out_tile[10] += filts_strip[2]*in_strip[7]; | |
out_tile[11] += filts_strip[3]*in_strip[7]; | |
out_tile[12] += filts_strip[4]*in_strip[7]; | |
out_tile[13] += filts_strip[5]*in_strip[7]; | |
out_tile[14] += filts_strip[6]*in_strip[7]; | |
out_tile[15] += filts_strip[7]*in_strip[7]; | |
out_tile[16] += filts_strip[0]*in_strip[9]; | |
out_tile[17] += filts_strip[1]*in_strip[9]; | |
out_tile[18] += filts_strip[2]*in_strip[9]; | |
out_tile[19] += filts_strip[3]*in_strip[9]; | |
out_tile[20] += filts_strip[4]*in_strip[9]; | |
out_tile[21] += filts_strip[5]*in_strip[9]; | |
out_tile[22] += filts_strip[6]*in_strip[9]; | |
out_tile[23] += filts_strip[7]*in_strip[9]; | |
out_tile[24] += filts_strip[0]*in_strip[11]; | |
out_tile[25] += filts_strip[1]*in_strip[11]; | |
out_tile[26] += filts_strip[2]*in_strip[11]; | |
out_tile[27] += filts_strip[3]*in_strip[11]; | |
out_tile[28] += filts_strip[4]*in_strip[11]; | |
out_tile[29] += filts_strip[5]*in_strip[11]; | |
out_tile[30] += filts_strip[6]*in_strip[11]; | |
out_tile[31] += filts_strip[7]*in_strip[11]; | |
out_tile[32] += filts_strip[0]*in_strip[13]; | |
out_tile[33] += filts_strip[1]*in_strip[13]; | |
out_tile[34] += filts_strip[2]*in_strip[13]; | |
out_tile[35] += filts_strip[3]*in_strip[13]; | |
out_tile[36] += filts_strip[4]*in_strip[13]; | |
out_tile[37] += filts_strip[5]*in_strip[13]; | |
out_tile[38] += filts_strip[6]*in_strip[13]; | |
out_tile[39] += filts_strip[7]*in_strip[13]; | |
out_tile[40] += filts_strip[0]*in_strip[15]; | |
out_tile[41] += filts_strip[1]*in_strip[15]; | |
out_tile[42] += filts_strip[2]*in_strip[15]; | |
out_tile[43] += filts_strip[3]*in_strip[15]; | |
out_tile[44] += filts_strip[4]*in_strip[15]; | |
out_tile[45] += filts_strip[5]*in_strip[15]; | |
out_tile[46] += filts_strip[6]*in_strip[15]; | |
out_tile[47] += filts_strip[7]*in_strip[15]; | |
out_tile[48] += filts_strip[0]*in_strip[17]; | |
out_tile[49] += filts_strip[1]*in_strip[17]; | |
out_tile[50] += filts_strip[2]*in_strip[17]; | |
out_tile[51] += filts_strip[3]*in_strip[17]; | |
out_tile[52] += filts_strip[4]*in_strip[17]; | |
out_tile[53] += filts_strip[5]*in_strip[17]; | |
out_tile[54] += filts_strip[6]*in_strip[17]; | |
out_tile[55] += filts_strip[7]*in_strip[17]; | |
out_tile[56] += filts_strip[0]*in_strip[19]; | |
out_tile[57] += filts_strip[1]*in_strip[19]; | |
out_tile[58] += filts_strip[2]*in_strip[19]; | |
out_tile[59] += filts_strip[3]*in_strip[19]; | |
out_tile[60] += filts_strip[4]*in_strip[19]; | |
out_tile[61] += filts_strip[5]*in_strip[19]; | |
out_tile[62] += filts_strip[6]*in_strip[19]; | |
out_tile[63] += filts_strip[7]*in_strip[19]; | |
filts_strip[0] = filts_smem_off[6*64+0*8]; | |
filts_strip[1] = filts_smem_off[6*64+1*8]; | |
filts_strip[2] = filts_smem_off[6*64+2*8]; | |
filts_strip[3] = filts_smem_off[6*64+3*8]; | |
filts_strip[4] = filts_smem_off[6*64+4*8]; | |
filts_strip[5] = filts_smem_off[6*64+5*8]; | |
filts_strip[6] = filts_smem_off[6*64+6*8]; | |
filts_strip[7] = filts_smem_off[6*64+7*8]; | |
out_tile[0] += filts_strip[0]*in_strip[6]; | |
out_tile[1] += filts_strip[1]*in_strip[6]; | |
out_tile[2] += filts_strip[2]*in_strip[6]; | |
out_tile[3] += filts_strip[3]*in_strip[6]; | |
out_tile[4] += filts_strip[4]*in_strip[6]; | |
out_tile[5] += filts_strip[5]*in_strip[6]; | |
out_tile[6] += filts_strip[6]*in_strip[6]; | |
out_tile[7] += filts_strip[7]*in_strip[6]; | |
out_tile[8] += filts_strip[0]*in_strip[8]; | |
out_tile[9] += filts_strip[1]*in_strip[8]; | |
out_tile[10] += filts_strip[2]*in_strip[8]; | |
out_tile[11] += filts_strip[3]*in_strip[8]; | |
out_tile[12] += filts_strip[4]*in_strip[8]; | |
out_tile[13] += filts_strip[5]*in_strip[8]; | |
out_tile[14] += filts_strip[6]*in_strip[8]; | |
out_tile[15] += filts_strip[7]*in_strip[8]; | |
out_tile[16] += filts_strip[0]*in_strip[10]; | |
out_tile[17] += filts_strip[1]*in_strip[10]; | |
out_tile[18] += filts_strip[2]*in_strip[10]; | |
out_tile[19] += filts_strip[3]*in_strip[10]; | |
out_tile[20] += filts_strip[4]*in_strip[10]; | |
out_tile[21] += filts_strip[5]*in_strip[10]; | |
out_tile[22] += filts_strip[6]*in_strip[10]; | |
out_tile[23] += filts_strip[7]*in_strip[10]; | |
out_tile[24] += filts_strip[0]*in_strip[12]; | |
out_tile[25] += filts_strip[1]*in_strip[12]; | |
out_tile[26] += filts_strip[2]*in_strip[12]; | |
out_tile[27] += filts_strip[3]*in_strip[12]; | |
out_tile[28] += filts_strip[4]*in_strip[12]; | |
out_tile[29] += filts_strip[5]*in_strip[12]; | |
out_tile[30] += filts_strip[6]*in_strip[12]; | |
out_tile[31] += filts_strip[7]*in_strip[12]; | |
out_tile[32] += filts_strip[0]*in_strip[14]; | |
out_tile[33] += filts_strip[1]*in_strip[14]; | |
out_tile[34] += filts_strip[2]*in_strip[14]; | |
out_tile[35] += filts_strip[3]*in_strip[14]; | |
out_tile[36] += filts_strip[4]*in_strip[14]; | |
out_tile[37] += filts_strip[5]*in_strip[14]; | |
out_tile[38] += filts_strip[6]*in_strip[14]; | |
out_tile[39] += filts_strip[7]*in_strip[14]; | |
out_tile[40] += filts_strip[0]*in_strip[16]; | |
out_tile[41] += filts_strip[1]*in_strip[16]; | |
out_tile[42] += filts_strip[2]*in_strip[16]; | |
out_tile[43] += filts_strip[3]*in_strip[16]; | |
out_tile[44] += filts_strip[4]*in_strip[16]; | |
out_tile[45] += filts_strip[5]*in_strip[16]; | |
out_tile[46] += filts_strip[6]*in_strip[16]; | |
out_tile[47] += filts_strip[7]*in_strip[16]; | |
out_tile[48] += filts_strip[0]*in_strip[18]; | |
out_tile[49] += filts_strip[1]*in_strip[18]; | |
out_tile[50] += filts_strip[2]*in_strip[18]; | |
out_tile[51] += filts_strip[3]*in_strip[18]; | |
out_tile[52] += filts_strip[4]*in_strip[18]; | |
out_tile[53] += filts_strip[5]*in_strip[18]; | |
out_tile[54] += filts_strip[6]*in_strip[18]; | |
out_tile[55] += filts_strip[7]*in_strip[18]; | |
out_tile[56] += filts_strip[0]*in_strip[20]; | |
out_tile[57] += filts_strip[1]*in_strip[20]; | |
out_tile[58] += filts_strip[2]*in_strip[20]; | |
out_tile[59] += filts_strip[3]*in_strip[20]; | |
out_tile[60] += filts_strip[4]*in_strip[20]; | |
out_tile[61] += filts_strip[5]*in_strip[20]; | |
out_tile[62] += filts_strip[6]*in_strip[20]; | |
out_tile[63] += filts_strip[7]*in_strip[20]; | |
; | |
} | |
} | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 64 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*64; | |
int32_t const load_reg = t_smem_bias_ix / 8; | |
int32_t const load_tile = t_smem_bias_ix % 8; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 64 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*8]; | |
filts_strip[1] = filts_smem_off[1*8]; | |
filts_strip[2] = filts_smem_off[2*8]; | |
filts_strip[3] = filts_smem_off[3*8]; | |
filts_strip[4] = filts_smem_off[4*8]; | |
filts_strip[5] = filts_smem_off[5*8]; | |
filts_strip[6] = filts_smem_off[6*8]; | |
filts_strip[7] = filts_smem_off[7*8]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { return; } | |
// begin t_tile_stores | |
if( (out_line/114) >= 20 ) { return; } | |
int32_t out_x = (GRP_ID_1D%15)*8; | |
int32_t out_chan = ((GRP_ID_1D%1)*8 + (LOC_ID_1D%8))*8; | |
GASQ float * out_off = out + (out_line/114)*831744 + out_chan*12996 + (out_line%114)*114 + out_x*1 ; | |
if( (out_x + 0) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= 114 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 64 ) { out_off[ 0*12996 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < 64 ) { out_off[ 1*12996 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < 64 ) { out_off[ 2*12996 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < 64 ) { out_off[ 3*12996 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < 64 ) { out_off[ 4*12996 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < 64 ) { out_off[ 5*12996 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < 64 ) { out_off[ 6*12996 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < 64 ) { out_off[ 7*12996 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 227 */ | |
/* in_dim_1 = 227 */ | |
/* kern_sz = 7 */ | |
/* stride = 2 */ | |
/* in_pad = 3 */ | |
/* t_tile_sz = 8 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 64 */ | |
/* in_chans = 3 */ | |
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_227__in_dim_1_227__kern_sz_7__stride_2__in_pad_3__t_tile_sz_8__conv_has_relu_1__out_chans_64__in_chans_3 */ | |
/* out_ix_x_dim = 114 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%114) */ | |
/* out_ix_y_dim = 114 */ | |
/* out_ix_y_sz = 114 */ | |
/* out_ix_y_nomod = (out_ix/114) */ | |
/* out_ix_y = ((out_ix/114)%%114) */ | |
/* out_ix_chan_dim = 64 */ | |
/* out_ix_chan_sz = 12996 */ | |
/* out_ix_chan_nomod = (out_ix/12996) */ | |
/* out_ix_chan = ((out_ix/12996)%%64) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 831744 */ | |
/* out_ix_img_nomod = (out_ix/831744) */ | |
/* out_ix_img = (out_ix/831744) */ | |
/* out_ix_sz = 16634880 */ | |
/* tpb = 128 */ | |
/* out_line_y_dim = 114 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%114) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 114 */ | |
/* out_line_img_nomod = (out_line/114) */ | |
/* out_line_img = (out_line/114) */ | |
/* out_line_sz = 2280 */ | |
/* in_ix_blk_x_dim = 21 */ | |
/* in_ix_blk_x_sz = 1 */ | |
/* in_ix_blk_x_nomod = in_ix */ | |
/* in_ix_blk_x = (in_ix%%21) */ | |
/* in_ix_blk_y_dim = 42 */ | |
/* in_ix_blk_y_sz = 21 */ | |
/* in_ix_blk_y_nomod = (in_ix/21) */ | |
/* in_ix_blk_y = ((in_ix/21)%%42) */ | |
/* in_ix_blk_in_chan_dim = 3 */ | |
/* in_ix_blk_in_chan_sz = 882 */ | |
/* in_ix_blk_in_chan_nomod = (in_ix/882) */ | |
/* in_ix_blk_in_chan = ((in_ix/882)%%3) */ | |
/* in_ix_blk_bx_dim = 15 */ | |
/* in_ix_blk_bx_sz = 2646 */ | |
/* in_ix_blk_bx_nomod = (in_ix/2646) */ | |
/* in_ix_blk_bx = ((in_ix/2646)%%15) */ | |
/* in_ix_blk_bline_dim = 143 */ | |
/* in_ix_blk_bline_sz = 39690 */ | |
/* in_ix_blk_bline_nomod = (in_ix/39690) */ | |
/* in_ix_blk_bline = (in_ix/39690) */ | |
/* in_ix_sz = 5675670 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 8 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%8) */ | |
/* LOC_ID_1D_blk_y_dim = 16 */ | |
/* LOC_ID_1D_blk_y_sz = 8 */ | |
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/8) */ | |
/* LOC_ID_1D_blk_y = (LOC_ID_1D/8) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_blk_bx_dim = 15 */ | |
/* GRP_ID_1D_blk_bx_sz = 1 */ | |
/* GRP_ID_1D_blk_bx_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_blk_bx = (GRP_ID_1D%%15) */ | |
/* GRP_ID_1D_blk_bline_dim = 143 */ | |
/* GRP_ID_1D_blk_bline_sz = 15 */ | |
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/15) */ | |
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/15) */ | |
/* GRP_ID_1D_sz = 2145 */ | |
/* blk_filt_ix_sz = 64 */ | |
/* filts_smem_sz = 448 */ | |
/* in_smem_sz = 882 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1330 */ | |
/* filts_xp_ix_out_chan_tile_dim = 8 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 8 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */ | |
/* filts_xp_ix_x_dim = 7 */ | |
/* filts_xp_ix_x_sz = 64 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/64)%%7) */ | |
/* filts_xp_ix_y_dim = 7 */ | |
/* filts_xp_ix_y_sz = 448 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/448) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/448)%%7) */ | |
/* filts_xp_ix_in_chan_dim = 3 */ | |
/* filts_xp_ix_in_chan_sz = 3136 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/3136) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/3136)%%3) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 9408 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/9408) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/9408) */ | |
/* filts_xp_ix_sz = 9408 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* filt_smem_loads = // begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
if( (LOC_ID_1D + %(tpb) * 3) < %(filts_smem_sz) ) { filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)];} | |
filts_off += %(filts_xp_ix_y_sz); | |
// end filt_smem_loads */ | |
/* in_smem_loads = // begin in_smem_loads | |
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ]; | |
if( (LOC_ID_1D + %(tpb) * 6) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 6)] = in[ blk_in_ix_base + (%(tpb)*6) ];} | |
blk_in_ix_base += %(in_ix_blk_in_chan_sz); | |
// end in_smem_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
in_strip[10] = in_smem_off[10]; | |
in_strip[11] = in_smem_off[11]; | |
in_strip[12] = in_smem_off[12]; | |
in_strip[13] = in_smem_off[13]; | |
in_strip[14] = in_smem_off[14]; | |
in_strip[15] = in_smem_off[15]; | |
in_strip[16] = in_smem_off[16]; | |
in_strip[17] = in_smem_off[17]; | |
in_strip[18] = in_smem_off[18]; | |
in_strip[19] = in_smem_off[19]; | |
in_strip[20] = in_smem_off[20]; | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[6]; | |
out_tile[25] += filts_strip[1]*in_strip[6]; | |
out_tile[26] += filts_strip[2]*in_strip[6]; | |
out_tile[27] += filts_strip[3]*in_strip[6]; | |
out_tile[28] += filts_strip[4]*in_strip[6]; | |
out_tile[29] += filts_strip[5]*in_strip[6]; | |
out_tile[30] += filts_strip[6]*in_strip[6]; | |
out_tile[31] += filts_strip[7]*in_strip[6]; | |
out_tile[32] += filts_strip[0]*in_strip[8]; | |
out_tile[33] += filts_strip[1]*in_strip[8]; | |
out_tile[34] += filts_strip[2]*in_strip[8]; | |
out_tile[35] += filts_strip[3]*in_strip[8]; | |
out_tile[36] += filts_strip[4]*in_strip[8]; | |
out_tile[37] += filts_strip[5]*in_strip[8]; | |
out_tile[38] += filts_strip[6]*in_strip[8]; | |
out_tile[39] += filts_strip[7]*in_strip[8]; | |
out_tile[40] += filts_strip[0]*in_strip[10]; | |
out_tile[41] += filts_strip[1]*in_strip[10]; | |
out_tile[42] += filts_strip[2]*in_strip[10]; | |
out_tile[43] += filts_strip[3]*in_strip[10]; | |
out_tile[44] += filts_strip[4]*in_strip[10]; | |
out_tile[45] += filts_strip[5]*in_strip[10]; | |
out_tile[46] += filts_strip[6]*in_strip[10]; | |
out_tile[47] += filts_strip[7]*in_strip[10]; | |
out_tile[48] += filts_strip[0]*in_strip[12]; | |
out_tile[49] += filts_strip[1]*in_strip[12]; | |
out_tile[50] += filts_strip[2]*in_strip[12]; | |
out_tile[51] += filts_strip[3]*in_strip[12]; | |
out_tile[52] += filts_strip[4]*in_strip[12]; | |
out_tile[53] += filts_strip[5]*in_strip[12]; | |
out_tile[54] += filts_strip[6]*in_strip[12]; | |
out_tile[55] += filts_strip[7]*in_strip[12]; | |
out_tile[56] += filts_strip[0]*in_strip[14]; | |
out_tile[57] += filts_strip[1]*in_strip[14]; | |
out_tile[58] += filts_strip[2]*in_strip[14]; | |
out_tile[59] += filts_strip[3]*in_strip[14]; | |
out_tile[60] += filts_strip[4]*in_strip[14]; | |
out_tile[61] += filts_strip[5]*in_strip[14]; | |
out_tile[62] += filts_strip[6]*in_strip[14]; | |
out_tile[63] += filts_strip[7]*in_strip[14]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[5]; | |
out_tile[17] += filts_strip[1]*in_strip[5]; | |
out_tile[18] += filts_strip[2]*in_strip[5]; | |
out_tile[19] += filts_strip[3]*in_strip[5]; | |
out_tile[20] += filts_strip[4]*in_strip[5]; | |
out_tile[21] += filts_strip[5]*in_strip[5]; | |
out_tile[22] += filts_strip[6]*in_strip[5]; | |
out_tile[23] += filts_strip[7]*in_strip[5]; | |
out_tile[24] += filts_strip[0]*in_strip[7]; | |
out_tile[25] += filts_strip[1]*in_strip[7]; | |
out_tile[26] += filts_strip[2]*in_strip[7]; | |
out_tile[27] += filts_strip[3]*in_strip[7]; | |
out_tile[28] += filts_strip[4]*in_strip[7]; | |
out_tile[29] += filts_strip[5]*in_strip[7]; | |
out_tile[30] += filts_strip[6]*in_strip[7]; | |
out_tile[31] += filts_strip[7]*in_strip[7]; | |
out_tile[32] += filts_strip[0]*in_strip[9]; | |
out_tile[33] += filts_strip[1]*in_strip[9]; | |
out_tile[34] += filts_strip[2]*in_strip[9]; | |
out_tile[35] += filts_strip[3]*in_strip[9]; | |
out_tile[36] += filts_strip[4]*in_strip[9]; | |
out_tile[37] += filts_strip[5]*in_strip[9]; | |
out_tile[38] += filts_strip[6]*in_strip[9]; | |
out_tile[39] += filts_strip[7]*in_strip[9]; | |
out_tile[40] += filts_strip[0]*in_strip[11]; | |
out_tile[41] += filts_strip[1]*in_strip[11]; | |
out_tile[42] += filts_strip[2]*in_strip[11]; | |
out_tile[43] += filts_strip[3]*in_strip[11]; | |
out_tile[44] += filts_strip[4]*in_strip[11]; | |
out_tile[45] += filts_strip[5]*in_strip[11]; | |
out_tile[46] += filts_strip[6]*in_strip[11]; | |
out_tile[47] += filts_strip[7]*in_strip[11]; | |
out_tile[48] += filts_strip[0]*in_strip[13]; | |
out_tile[49] += filts_strip[1]*in_strip[13]; | |
out_tile[50] += filts_strip[2]*in_strip[13]; | |
out_tile[51] += filts_strip[3]*in_strip[13]; | |
out_tile[52] += filts_strip[4]*in_strip[13]; | |
out_tile[53] += filts_strip[5]*in_strip[13]; | |
out_tile[54] += filts_strip[6]*in_strip[13]; | |
out_tile[55] += filts_strip[7]*in_strip[13]; | |
out_tile[56] += filts_strip[0]*in_strip[15]; | |
out_tile[57] += filts_strip[1]*in_strip[15]; | |
out_tile[58] += filts_strip[2]*in_strip[15]; | |
out_tile[59] += filts_strip[3]*in_strip[15]; | |
out_tile[60] += filts_strip[4]*in_strip[15]; | |
out_tile[61] += filts_strip[5]*in_strip[15]; | |
out_tile[62] += filts_strip[6]*in_strip[15]; | |
out_tile[63] += filts_strip[7]*in_strip[15]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[4]; | |
out_tile[9] += filts_strip[1]*in_strip[4]; | |
out_tile[10] += filts_strip[2]*in_strip[4]; | |
out_tile[11] += filts_strip[3]*in_strip[4]; | |
out_tile[12] += filts_strip[4]*in_strip[4]; | |
out_tile[13] += filts_strip[5]*in_strip[4]; | |
out_tile[14] += filts_strip[6]*in_strip[4]; | |
out_tile[15] += filts_strip[7]*in_strip[4]; | |
out_tile[16] += filts_strip[0]*in_strip[6]; | |
out_tile[17] += filts_strip[1]*in_strip[6]; | |
out_tile[18] += filts_strip[2]*in_strip[6]; | |
out_tile[19] += filts_strip[3]*in_strip[6]; | |
out_tile[20] += filts_strip[4]*in_strip[6]; | |
out_tile[21] += filts_strip[5]*in_strip[6]; | |
out_tile[22] += filts_strip[6]*in_strip[6]; | |
out_tile[23] += filts_strip[7]*in_strip[6]; | |
out_tile[24] += filts_strip[0]*in_strip[8]; | |
out_tile[25] += filts_strip[1]*in_strip[8]; | |
out_tile[26] += filts_strip[2]*in_strip[8]; | |
out_tile[27] += filts_strip[3]*in_strip[8]; | |
out_tile[28] += filts_strip[4]*in_strip[8]; | |
out_tile[29] += filts_strip[5]*in_strip[8]; | |
out_tile[30] += filts_strip[6]*in_strip[8]; | |
out_tile[31] += filts_strip[7]*in_strip[8]; | |
out_tile[32] += filts_strip[0]*in_strip[10]; | |
out_tile[33] += filts_strip[1]*in_strip[10]; | |
out_tile[34] += filts_strip[2]*in_strip[10]; | |
out_tile[35] += filts_strip[3]*in_strip[10]; | |
out_tile[36] += filts_strip[4]*in_strip[10]; | |
out_tile[37] += filts_strip[5]*in_strip[10]; | |
out_tile[38] += filts_strip[6]*in_strip[10]; | |
out_tile[39] += filts_strip[7]*in_strip[10]; | |
out_tile[40] += filts_strip[0]*in_strip[12]; | |
out_tile[41] += filts_strip[1]*in_strip[12]; | |
out_tile[42] += filts_strip[2]*in_strip[12]; | |
out_tile[43] += filts_strip[3]*in_strip[12]; | |
out_tile[44] += filts_strip[4]*in_strip[12]; | |
out_tile[45] += filts_strip[5]*in_strip[12]; | |
out_tile[46] += filts_strip[6]*in_strip[12]; | |
out_tile[47] += filts_strip[7]*in_strip[12]; | |
out_tile[48] += filts_strip[0]*in_strip[14]; | |
out_tile[49] += filts_strip[1]*in_strip[14]; | |
out_tile[50] += filts_strip[2]*in_strip[14]; | |
out_tile[51] += filts_strip[3]*in_strip[14]; | |
out_tile[52] += filts_strip[4]*in_strip[14]; | |
out_tile[53] += filts_strip[5]*in_strip[14]; | |
out_tile[54] += filts_strip[6]*in_strip[14]; | |
out_tile[55] += filts_strip[7]*in_strip[14]; | |
out_tile[56] += filts_strip[0]*in_strip[16]; | |
out_tile[57] += filts_strip[1]*in_strip[16]; | |
out_tile[58] += filts_strip[2]*in_strip[16]; | |
out_tile[59] += filts_strip[3]*in_strip[16]; | |
out_tile[60] += filts_strip[4]*in_strip[16]; | |
out_tile[61] += filts_strip[5]*in_strip[16]; | |
out_tile[62] += filts_strip[6]*in_strip[16]; | |
out_tile[63] += filts_strip[7]*in_strip[16]; | |
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[3]; | |
out_tile[1] += filts_strip[1]*in_strip[3]; | |
out_tile[2] += filts_strip[2]*in_strip[3]; | |
out_tile[3] += filts_strip[3]*in_strip[3]; | |
out_tile[4] += filts_strip[4]*in_strip[3]; | |
out_tile[5] += filts_strip[5]*in_strip[3]; | |
out_tile[6] += filts_strip[6]*in_strip[3]; | |
out_tile[7] += filts_strip[7]*in_strip[3]; | |
out_tile[8] += filts_strip[0]*in_strip[5]; | |
out_tile[9] += filts_strip[1]*in_strip[5]; | |
out_tile[10] += filts_strip[2]*in_strip[5]; | |
out_tile[11] += filts_strip[3]*in_strip[5]; | |
out_tile[12] += filts_strip[4]*in_strip[5]; | |
out_tile[13] += filts_strip[5]*in_strip[5]; | |
out_tile[14] += filts_strip[6]*in_strip[5]; | |
out_tile[15] += filts_strip[7]*in_strip[5]; | |
out_tile[16] += filts_strip[0]*in_strip[7]; | |
out_tile[17] += filts_strip[1]*in_strip[7]; | |
out_tile[18] += filts_strip[2]*in_strip[7]; | |
out_tile[19] += filts_strip[3]*in_strip[7]; | |
out_tile[20] += filts_strip[4]*in_strip[7]; | |
out_tile[21] += filts_strip[5]*in_strip[7]; | |
out_tile[22] += filts_strip[6]*in_strip[7]; | |
out_tile[23] += filts_strip[7]*in_strip[7]; | |
out_tile[24] += filts_strip[0]*in_strip[9]; | |
out_tile[25] += filts_strip[1]*in_strip[9]; | |
out_tile[26] += filts_strip[2]*in_strip[9]; | |
out_tile[27] += filts_strip[3]*in_strip[9]; | |
out_tile[28] += filts_strip[4]*in_strip[9]; | |
out_tile[29] += filts_strip[5]*in_strip[9]; | |
out_tile[30] += filts_strip[6]*in_strip[9]; | |
out_tile[31] += filts_strip[7]*in_strip[9]; | |
out_tile[32] += filts_strip[0]*in_strip[11]; | |
out_tile[33] += filts_strip[1]*in_strip[11]; | |
out_tile[34] += filts_strip[2]*in_strip[11]; | |
out_tile[35] += filts_strip[3]*in_strip[11]; | |
out_tile[36] += filts_strip[4]*in_strip[11]; | |
out_tile[37] += filts_strip[5]*in_strip[11]; | |
out_tile[38] += filts_strip[6]*in_strip[11]; | |
out_tile[39] += filts_strip[7]*in_strip[11]; | |
out_tile[40] += filts_strip[0]*in_strip[13]; | |
out_tile[41] += filts_strip[1]*in_strip[13]; | |
out_tile[42] += filts_strip[2]*in_strip[13]; | |
out_tile[43] += filts_strip[3]*in_strip[13]; | |
out_tile[44] += filts_strip[4]*in_strip[13]; | |
out_tile[45] += filts_strip[5]*in_strip[13]; | |
out_tile[46] += filts_strip[6]*in_strip[13]; | |
out_tile[47] += filts_strip[7]*in_strip[13]; | |
out_tile[48] += filts_strip[0]*in_strip[15]; | |
out_tile[49] += filts_strip[1]*in_strip[15]; | |
out_tile[50] += filts_strip[2]*in_strip[15]; | |
out_tile[51] += filts_strip[3]*in_strip[15]; | |
out_tile[52] += filts_strip[4]*in_strip[15]; | |
out_tile[53] += filts_strip[5]*in_strip[15]; | |
out_tile[54] += filts_strip[6]*in_strip[15]; | |
out_tile[55] += filts_strip[7]*in_strip[15]; | |
out_tile[56] += filts_strip[0]*in_strip[17]; | |
out_tile[57] += filts_strip[1]*in_strip[17]; | |
out_tile[58] += filts_strip[2]*in_strip[17]; | |
out_tile[59] += filts_strip[3]*in_strip[17]; | |
out_tile[60] += filts_strip[4]*in_strip[17]; | |
out_tile[61] += filts_strip[5]*in_strip[17]; | |
out_tile[62] += filts_strip[6]*in_strip[17]; | |
out_tile[63] += filts_strip[7]*in_strip[17]; | |
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[4]; | |
out_tile[1] += filts_strip[1]*in_strip[4]; | |
out_tile[2] += filts_strip[2]*in_strip[4]; | |
out_tile[3] += filts_strip[3]*in_strip[4]; | |
out_tile[4] += filts_strip[4]*in_strip[4]; | |
out_tile[5] += filts_strip[5]*in_strip[4]; | |
out_tile[6] += filts_strip[6]*in_strip[4]; | |
out_tile[7] += filts_strip[7]*in_strip[4]; | |
out_tile[8] += filts_strip[0]*in_strip[6]; | |
out_tile[9] += filts_strip[1]*in_strip[6]; | |
out_tile[10] += filts_strip[2]*in_strip[6]; | |
out_tile[11] += filts_strip[3]*in_strip[6]; | |
out_tile[12] += filts_strip[4]*in_strip[6]; | |
out_tile[13] += filts_strip[5]*in_strip[6]; | |
out_tile[14] += filts_strip[6]*in_strip[6]; | |
out_tile[15] += filts_strip[7]*in_strip[6]; | |
out_tile[16] += filts_strip[0]*in_strip[8]; | |
out_tile[17] += filts_strip[1]*in_strip[8]; | |
out_tile[18] += filts_strip[2]*in_strip[8]; | |
out_tile[19] += filts_strip[3]*in_strip[8]; | |
out_tile[20] += filts_strip[4]*in_strip[8]; | |
out_tile[21] += filts_strip[5]*in_strip[8]; | |
out_tile[22] += filts_strip[6]*in_strip[8]; | |
out_tile[23] += filts_strip[7]*in_strip[8]; | |
out_tile[24] += filts_strip[0]*in_strip[10]; | |
out_tile[25] += filts_strip[1]*in_strip[10]; | |
out_tile[26] += filts_strip[2]*in_strip[10]; | |
out_tile[27] += filts_strip[3]*in_strip[10]; | |
out_tile[28] += filts_strip[4]*in_strip[10]; | |
out_tile[29] += filts_strip[5]*in_strip[10]; | |
out_tile[30] += filts_strip[6]*in_strip[10]; | |
out_tile[31] += filts_strip[7]*in_strip[10]; | |
out_tile[32] += filts_strip[0]*in_strip[12]; | |
out_tile[33] += filts_strip[1]*in_strip[12]; | |
out_tile[34] += filts_strip[2]*in_strip[12]; | |
out_tile[35] += filts_strip[3]*in_strip[12]; | |
out_tile[36] += filts_strip[4]*in_strip[12]; | |
out_tile[37] += filts_strip[5]*in_strip[12]; | |
out_tile[38] += filts_strip[6]*in_strip[12]; | |
out_tile[39] += filts_strip[7]*in_strip[12]; | |
out_tile[40] += filts_strip[0]*in_strip[14]; | |
out_tile[41] += filts_strip[1]*in_strip[14]; | |
out_tile[42] += filts_strip[2]*in_strip[14]; | |
out_tile[43] += filts_strip[3]*in_strip[14]; | |
out_tile[44] += filts_strip[4]*in_strip[14]; | |
out_tile[45] += filts_strip[5]*in_strip[14]; | |
out_tile[46] += filts_strip[6]*in_strip[14]; | |
out_tile[47] += filts_strip[7]*in_strip[14]; | |
out_tile[48] += filts_strip[0]*in_strip[16]; | |
out_tile[49] += filts_strip[1]*in_strip[16]; | |
out_tile[50] += filts_strip[2]*in_strip[16]; | |
out_tile[51] += filts_strip[3]*in_strip[16]; | |
out_tile[52] += filts_strip[4]*in_strip[16]; | |
out_tile[53] += filts_strip[5]*in_strip[16]; | |
out_tile[54] += filts_strip[6]*in_strip[16]; | |
out_tile[55] += filts_strip[7]*in_strip[16]; | |
out_tile[56] += filts_strip[0]*in_strip[18]; | |
out_tile[57] += filts_strip[1]*in_strip[18]; | |
out_tile[58] += filts_strip[2]*in_strip[18]; | |
out_tile[59] += filts_strip[3]*in_strip[18]; | |
out_tile[60] += filts_strip[4]*in_strip[18]; | |
out_tile[61] += filts_strip[5]*in_strip[18]; | |
out_tile[62] += filts_strip[6]*in_strip[18]; | |
out_tile[63] += filts_strip[7]*in_strip[18]; | |
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[5]; | |
out_tile[1] += filts_strip[1]*in_strip[5]; | |
out_tile[2] += filts_strip[2]*in_strip[5]; | |
out_tile[3] += filts_strip[3]*in_strip[5]; | |
out_tile[4] += filts_strip[4]*in_strip[5]; | |
out_tile[5] += filts_strip[5]*in_strip[5]; | |
out_tile[6] += filts_strip[6]*in_strip[5]; | |
out_tile[7] += filts_strip[7]*in_strip[5]; | |
out_tile[8] += filts_strip[0]*in_strip[7]; | |
out_tile[9] += filts_strip[1]*in_strip[7]; | |
out_tile[10] += filts_strip[2]*in_strip[7]; | |
out_tile[11] += filts_strip[3]*in_strip[7]; | |
out_tile[12] += filts_strip[4]*in_strip[7]; | |
out_tile[13] += filts_strip[5]*in_strip[7]; | |
out_tile[14] += filts_strip[6]*in_strip[7]; | |
out_tile[15] += filts_strip[7]*in_strip[7]; | |
out_tile[16] += filts_strip[0]*in_strip[9]; | |
out_tile[17] += filts_strip[1]*in_strip[9]; | |
out_tile[18] += filts_strip[2]*in_strip[9]; | |
out_tile[19] += filts_strip[3]*in_strip[9]; | |
out_tile[20] += filts_strip[4]*in_strip[9]; | |
out_tile[21] += filts_strip[5]*in_strip[9]; | |
out_tile[22] += filts_strip[6]*in_strip[9]; | |
out_tile[23] += filts_strip[7]*in_strip[9]; | |
out_tile[24] += filts_strip[0]*in_strip[11]; | |
out_tile[25] += filts_strip[1]*in_strip[11]; | |
out_tile[26] += filts_strip[2]*in_strip[11]; | |
out_tile[27] += filts_strip[3]*in_strip[11]; | |
out_tile[28] += filts_strip[4]*in_strip[11]; | |
out_tile[29] += filts_strip[5]*in_strip[11]; | |
out_tile[30] += filts_strip[6]*in_strip[11]; | |
out_tile[31] += filts_strip[7]*in_strip[11]; | |
out_tile[32] += filts_strip[0]*in_strip[13]; | |
out_tile[33] += filts_strip[1]*in_strip[13]; | |
out_tile[34] += filts_strip[2]*in_strip[13]; | |
out_tile[35] += filts_strip[3]*in_strip[13]; | |
out_tile[36] += filts_strip[4]*in_strip[13]; | |
out_tile[37] += filts_strip[5]*in_strip[13]; | |
out_tile[38] += filts_strip[6]*in_strip[13]; | |
out_tile[39] += filts_strip[7]*in_strip[13]; | |
out_tile[40] += filts_strip[0]*in_strip[15]; | |
out_tile[41] += filts_strip[1]*in_strip[15]; | |
out_tile[42] += filts_strip[2]*in_strip[15]; | |
out_tile[43] += filts_strip[3]*in_strip[15]; | |
out_tile[44] += filts_strip[4]*in_strip[15]; | |
out_tile[45] += filts_strip[5]*in_strip[15]; | |
out_tile[46] += filts_strip[6]*in_strip[15]; | |
out_tile[47] += filts_strip[7]*in_strip[15]; | |
out_tile[48] += filts_strip[0]*in_strip[17]; | |
out_tile[49] += filts_strip[1]*in_strip[17]; | |
out_tile[50] += filts_strip[2]*in_strip[17]; | |
out_tile[51] += filts_strip[3]*in_strip[17]; | |
out_tile[52] += filts_strip[4]*in_strip[17]; | |
out_tile[53] += filts_strip[5]*in_strip[17]; | |
out_tile[54] += filts_strip[6]*in_strip[17]; | |
out_tile[55] += filts_strip[7]*in_strip[17]; | |
out_tile[56] += filts_strip[0]*in_strip[19]; | |
out_tile[57] += filts_strip[1]*in_strip[19]; | |
out_tile[58] += filts_strip[2]*in_strip[19]; | |
out_tile[59] += filts_strip[3]*in_strip[19]; | |
out_tile[60] += filts_strip[4]*in_strip[19]; | |
out_tile[61] += filts_strip[5]*in_strip[19]; | |
out_tile[62] += filts_strip[6]*in_strip[19]; | |
out_tile[63] += filts_strip[7]*in_strip[19]; | |
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[6]; | |
out_tile[1] += filts_strip[1]*in_strip[6]; | |
out_tile[2] += filts_strip[2]*in_strip[6]; | |
out_tile[3] += filts_strip[3]*in_strip[6]; | |
out_tile[4] += filts_strip[4]*in_strip[6]; | |
out_tile[5] += filts_strip[5]*in_strip[6]; | |
out_tile[6] += filts_strip[6]*in_strip[6]; | |
out_tile[7] += filts_strip[7]*in_strip[6]; | |
out_tile[8] += filts_strip[0]*in_strip[8]; | |
out_tile[9] += filts_strip[1]*in_strip[8]; | |
out_tile[10] += filts_strip[2]*in_strip[8]; | |
out_tile[11] += filts_strip[3]*in_strip[8]; | |
out_tile[12] += filts_strip[4]*in_strip[8]; | |
out_tile[13] += filts_strip[5]*in_strip[8]; | |
out_tile[14] += filts_strip[6]*in_strip[8]; | |
out_tile[15] += filts_strip[7]*in_strip[8]; | |
out_tile[16] += filts_strip[0]*in_strip[10]; | |
out_tile[17] += filts_strip[1]*in_strip[10]; | |
out_tile[18] += filts_strip[2]*in_strip[10]; | |
out_tile[19] += filts_strip[3]*in_strip[10]; | |
out_tile[20] += filts_strip[4]*in_strip[10]; | |
out_tile[21] += filts_strip[5]*in_strip[10]; | |
out_tile[22] += filts_strip[6]*in_strip[10]; | |
out_tile[23] += filts_strip[7]*in_strip[10]; | |
out_tile[24] += filts_strip[0]*in_strip[12]; | |
out_tile[25] += filts_strip[1]*in_strip[12]; | |
out_tile[26] += filts_strip[2]*in_strip[12]; | |
out_tile[27] += filts_strip[3]*in_strip[12]; | |
out_tile[28] += filts_strip[4]*in_strip[12]; | |
out_tile[29] += filts_strip[5]*in_strip[12]; | |
out_tile[30] += filts_strip[6]*in_strip[12]; | |
out_tile[31] += filts_strip[7]*in_strip[12]; | |
out_tile[32] += filts_strip[0]*in_strip[14]; | |
out_tile[33] += filts_strip[1]*in_strip[14]; | |
out_tile[34] += filts_strip[2]*in_strip[14]; | |
out_tile[35] += filts_strip[3]*in_strip[14]; | |
out_tile[36] += filts_strip[4]*in_strip[14]; | |
out_tile[37] += filts_strip[5]*in_strip[14]; | |
out_tile[38] += filts_strip[6]*in_strip[14]; | |
out_tile[39] += filts_strip[7]*in_strip[14]; | |
out_tile[40] += filts_strip[0]*in_strip[16]; | |
out_tile[41] += filts_strip[1]*in_strip[16]; | |
out_tile[42] += filts_strip[2]*in_strip[16]; | |
out_tile[43] += filts_strip[3]*in_strip[16]; | |
out_tile[44] += filts_strip[4]*in_strip[16]; | |
out_tile[45] += filts_strip[5]*in_strip[16]; | |
out_tile[46] += filts_strip[6]*in_strip[16]; | |
out_tile[47] += filts_strip[7]*in_strip[16]; | |
out_tile[48] += filts_strip[0]*in_strip[18]; | |
out_tile[49] += filts_strip[1]*in_strip[18]; | |
out_tile[50] += filts_strip[2]*in_strip[18]; | |
out_tile[51] += filts_strip[3]*in_strip[18]; | |
out_tile[52] += filts_strip[4]*in_strip[18]; | |
out_tile[53] += filts_strip[5]*in_strip[18]; | |
out_tile[54] += filts_strip[6]*in_strip[18]; | |
out_tile[55] += filts_strip[7]*in_strip[18]; | |
out_tile[56] += filts_strip[0]*in_strip[20]; | |
out_tile[57] += filts_strip[1]*in_strip[20]; | |
out_tile[58] += filts_strip[2]*in_strip[20]; | |
out_tile[59] += filts_strip[3]*in_strip[20]; | |
out_tile[60] += filts_strip[4]*in_strip[20]; | |
out_tile[61] += filts_strip[5]*in_strip[20]; | |
out_tile[62] += filts_strip[6]*in_strip[20]; | |
out_tile[63] += filts_strip[7]*in_strip[20]; | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; } | |
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz); | |
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz); | |
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ; | |
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
// Copies the input into a per-block layout: each work-item fills exactly one
// element of out, reading the matching input pel or writing 0.0f when that pel
// lies outside the image (or past the last image).
//   in  : source pels, indexed as img*154587 + chan*51529 + y*227 + x
//   out : destination buffer of 5675670 elements, indexed by out_ix
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_2__kern_sz_7__in_pad_3__in_chans_3__ysz_227__xsz_227__tix_pels_tile_sz_16__t_tile_sz_8__bix_pels_blk_sz_2145( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 5675670 ) { return; } // extra work-items past the end of out do nothing

  // split out_ix into its block coordinates (x varies fastest, bline slowest)
  int32_t const blk_x = out_ix % 21;
  int32_t const blk_y = (out_ix / 21) % 42;
  int32_t const blk_in_chan = (out_ix / 882) % 3;
  int32_t const blk_bx = (out_ix / 2646) % 15;
  int32_t const blk_bline = out_ix / 39690;

  // first output line handled by this block-line, and the input line (counted
  // from the top of that line's image) that this element corresponds to
  int32_t const first_out_line = blk_bline * 16;
  int32_t const rel_in_line = blk_y + (first_out_line % 114) * 2;

  // a block may run past the bottom of its image; if so, wrap into the next image
  int32_t const lines_per_img = (114 - 1)*2 + 7;
  int32_t const img = (first_out_line / 114) + (rel_in_line / lines_per_img);

  // input coordinates; the -3 shifts can push them below zero
  int32_t const iy = (rel_in_line % lines_per_img) - 3;
  int32_t const ix = blk_bx*8*2 + blk_x - 3;

  // only dereference in[] for coordinates that really exist
  int32_t const inside = ( ix >= 0 ) && ( iy >= 0 ) && ( ix < 227 ) && ( iy < 227 ) && ( img < 20 );
  out[out_ix] = inside ? in[ img*154587 + blk_in_chan*51529 + iy*227 + ix*1 ] : 0.0f;
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* stride = 2 */ | |
/* kern_sz = 7 */ | |
/* in_pad = 3 */ | |
/* in_chans = 3 */ | |
/* ysz = 227 */ | |
/* xsz = 227 */ | |
/* tix_pels_tile_sz = 16 */ | |
/* t_tile_sz = 8 */ | |
/* bix_pels_blk_sz = 2145 */ | |
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_2__kern_sz_7__in_pad_3__in_chans_3__ysz_227__xsz_227__tix_pels_tile_sz_16__t_tile_sz_8__bix_pels_blk_sz_2145 */ | |
/* out_ix_blk_x_dim = 21 */ | |
/* out_ix_blk_x_sz = 1 */ | |
/* out_ix_blk_x_nomod = out_ix */ | |
/* out_ix_blk_x = (out_ix%%21) */ | |
/* out_ix_blk_y_dim = 42 */ | |
/* out_ix_blk_y_sz = 21 */ | |
/* out_ix_blk_y_nomod = (out_ix/21) */ | |
/* out_ix_blk_y = ((out_ix/21)%%42) */ | |
/* out_ix_blk_in_chan_dim = 3 */ | |
/* out_ix_blk_in_chan_sz = 882 */ | |
/* out_ix_blk_in_chan_nomod = (out_ix/882) */ | |
/* out_ix_blk_in_chan = ((out_ix/882)%%3) */ | |
/* out_ix_blk_bx_dim = 15 */ | |
/* out_ix_blk_bx_sz = 2646 */ | |
/* out_ix_blk_bx_nomod = (out_ix/2646) */ | |
/* out_ix_blk_bx = ((out_ix/2646)%%15) */ | |
/* out_ix_blk_bline_dim = 143 */ | |
/* out_ix_blk_bline_sz = 39690 */ | |
/* out_ix_blk_bline_nomod = (out_ix/39690) */ | |
/* out_ix_blk_bline = (out_ix/39690) */ | |
/* out_ix_sz = 5675670 */ | |
/* out_line_y_dim = 114 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%114) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 114 */ | |
/* out_line_img_nomod = (out_line/114) */ | |
/* out_line_img = (out_line/114) */ | |
/* out_line_sz = 2280 */ | |
/* in_ix_x_dim = 227 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%227) */ | |
/* in_ix_y_dim = 227 */ | |
/* in_ix_y_sz = 227 */ | |
/* in_ix_y_nomod = (in_ix/227) */ | |
/* in_ix_y = ((in_ix/227)%%227) */ | |
/* in_ix_chan_dim = 3 */ | |
/* in_ix_chan_sz = 51529 */ | |
/* in_ix_chan_nomod = (in_ix/51529) */ | |
/* in_ix_chan = ((in_ix/51529)%%3) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 154587 */ | |
/* in_ix_img_nomod = (in_ix/154587) */ | |
/* in_ix_img = (in_ix/154587) */ | |
/* in_ix_sz = 3091740 */ | |
// Reorders the filter weights: each work-item copies one element of in to its
// transposed position in out. The "CUCL ..." comments are kept verbatim
// (presumably annotations read by the code generator -- confirm before editing).
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_64__in_chans_3__kysz_7__kxsz_7( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix >= 9408 ) { return; } // one work-item per filter element

  // source coordinates of this element (x varies fastest, out_chan slowest)
  int32_t const kx = filts_ix % 7;
  int32_t const ky = (filts_ix / 7) % 7;
  int32_t const in_chan = (filts_ix / 49) % 3;
  int32_t const fioc = filts_ix / 147; // output channel

  // destination index: out_chan_blk, in_chan, y, x, then the two pieces of the
  // output channel (fioc%8 with stride 8, (fioc/8)%8 with stride 1)
  int32_t const filts_xp_ix =
    (fioc/64)*9408 +
    in_chan*3136 +
    ky*448 +
    kx*64 +
    (fioc%8)*8 +
    ((fioc/8)%8)*1;

#if 1
  float const val = in[filts_ix];
#else
  // debug alternative: write a coordinate-derived pattern for input channel 0, zero elsewhere
  float val = 0.0f;
  if( in_chan == 0 ) {
    val = kx*100 + ky;
  }
#endif
  out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 64 */ | |
/* in_chans = 3 */ | |
/* kysz = 7 */ | |
/* kxsz = 7 */ | |
/* rtc_func_name = xpose_filts__out_chans_64__in_chans_3__kysz_7__kxsz_7 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 7 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%7) */ | |
/* filts_ix_y_dim = 7 */ | |
/* filts_ix_y_sz = 7 */ | |
/* filts_ix_y_nomod = (filts_ix/7) */ | |
/* filts_ix_y = ((filts_ix/7)%%7) */ | |
/* filts_ix_in_chan_dim = 3 */ | |
/* filts_ix_in_chan_sz = 49 */ | |
/* filts_ix_in_chan_nomod = (filts_ix/49) */ | |
/* filts_ix_in_chan = ((filts_ix/49)%%3) */ | |
/* filts_ix_out_chan_dim = 64 */ | |
/* filts_ix_out_chan_sz = 147 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/147) */ | |
/* filts_ix_out_chan = (filts_ix/147) */ | |
/* filts_ix_sz = 9408 */ | |
/* filts_xp_ix_out_chan_tile_dim = 8 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 8 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */ | |
/* filts_xp_ix_x_dim = 7 */ | |
/* filts_xp_ix_x_sz = 64 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/64)%%7) */ | |
/* filts_xp_ix_y_dim = 7 */ | |
/* filts_xp_ix_y_sz = 448 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/448) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/448)%%7) */ | |
/* filts_xp_ix_in_chan_dim = 3 */ | |
/* filts_xp_ix_in_chan_sz = 3136 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/3136) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/3136)%%3) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 9408 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/9408) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/9408) */ | |
/* filts_xp_ix_sz = 9408 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 8 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%8) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 64 */ | |
/* fioc_out_chan_blk_nomod = (fioc/64) */ | |
/* fioc_out_chan_blk = (fioc/64) */ | |
/* fioc_sz = 64 */ | |
// 3x3 window, stride 2 max reduction: each work-item produces one element of out
// from a 114x114 input plane.
//   in  : indexed as img*831744 + chan*12996 + y*114 + x
//   out : 4158720 elements, indexed as img*207936 + chan*3249 + y*57 + x
// NOTE(review): the running max starts at 0.0f and window taps that fall outside
// the plane count as 0.0f, so the stored value is never below zero.
CUCL_GLOBAL_KERNEL void pool__num_imgs_20__in_pad_0__in_dim_0_114__in_dim_1_114__conv_has_relu_0__kern_sz_3__stride_2__out_chans_64__avg_pool_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 4158720 ) { return; } // one work-item per output element

  // top-left corner of this output's window in the input plane
  int32_t const win_y0 = ((out_ix/57)%57)*2 - 0;
  int32_t const win_x0 = (out_ix%57)*2 - 0;
  // offset of the (img, chan) input plane this output reads from
  int32_t const plane_off = (out_ix/207936)*831744 + ((out_ix/3249)%64)*12996;

  float best = 0.0f;
  for( int32_t kx = 0; kx < 3; ++kx ) {
    int32_t const src_x = win_x0 + kx;
    for( int32_t ky = 0; ky < 3; ++ky ) {
      int32_t const src_y = win_y0 + ky;
      int32_t const inside = ( src_y >= 0 ) && ( src_x >= 0 ) && ( src_x < 114 ) && ( src_y < 114 );
      // taps outside the plane are treated as 0 rather than skipped
      float const tap = inside ? in[ plane_off + src_y*114 + src_x*1 ] : 0.0f;
      best = max( best, tap );
    }
  }
  out[out_ix] = best;
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* in_pad = 0 */ | |
/* in_dim_0 = 114 */ | |
/* in_dim_1 = 114 */ | |
/* conv_has_relu = 0 */ | |
/* kern_sz = 3 */ | |
/* stride = 2 */ | |
/* out_chans = 64 */ | |
/* avg_pool = 0 */ | |
/* rtc_func_name = pool__num_imgs_20__in_pad_0__in_dim_0_114__in_dim_1_114__conv_has_relu_0__kern_sz_3__stride_2__out_chans_64__avg_pool_0 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 57 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%57) */ | |
/* out_ix_y_dim = 57 */ | |
/* out_ix_y_sz = 57 */ | |
/* out_ix_y_nomod = (out_ix/57) */ | |
/* out_ix_y = ((out_ix/57)%%57) */ | |
/* out_ix_chan_dim = 64 */ | |
/* out_ix_chan_sz = 3249 */ | |
/* out_ix_chan_nomod = (out_ix/3249) */ | |
/* out_ix_chan = ((out_ix/3249)%%64) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 207936 */ | |
/* out_ix_img_nomod = (out_ix/207936) */ | |
/* out_ix_img = (out_ix/207936) */ | |
/* out_ix_sz = 4158720 */ | |
/* in_ix_x_dim = 114 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%114) */ | |
/* in_ix_y_dim = 114 */ | |
/* in_ix_y_sz = 114 */ | |
/* in_ix_y_nomod = (in_ix/114) */ | |
/* in_ix_y = ((in_ix/114)%%114) */ | |
/* in_ix_chan_dim = 64 */ | |
/* in_ix_chan_sz = 12996 */ | |
/* in_ix_chan_nomod = (in_ix/12996) */ | |
/* in_ix_chan = ((in_ix/12996)%%64) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 831744 */ | |
/* in_ix_img_nomod = (in_ix/831744) */ | |
/* in_ix_img = (in_ix/831744) */ | |
/* in_ix_sz = 16634880 */ | |
/* op = out_v = max( out_v, v ) */ | |
/* op_post = */ | |
// each thread: computes outputs across chan dim, using inputs across chan dim
//
// Cross-channel normalization over a 5-channel window: for one (img, y, x)
// position per work-item, every channel value is multiplied by
//   (1 + 0.0001 * sum_of_squares_in_window / 5) ^ -0.75
// where the window is centered on the channel and channels outside [0,64) count as 0.
//   in  : indexed as img*207936 + chan*3249 + y*57 + x
//   out : same indexing as in
CUCL_GLOBAL_KERNEL void lrn__num_imgs_20__chans_64__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1( GASQ float const * const in, GASQ float * const out ) {
  int32_t const tix = GLOB_ID_1D;
  if( tix >= 64980 ) { return; } // one work-item per (img, y, x) position
  // iterate over chans
  float ls_buf[5] = {0.0f}; // ring buffer of the 5 most recently read channel values
  int32_t const hls = 5 >> 1; // half window size: how far the read position leads the written channel
  // index of channel 0 at this work-item's (img, y, x)
  int32_t const out_base_ix = (tix/3249)*207936 + ((tix/57)%57)*57 + (tix%57)*1;
  // run hls channels past the end so the last channels still get written
  for( int32_t in_chan_ix = 0; in_chan_ix < 64 + hls; ++in_chan_ix ) {
    int32_t const in_off = in_chan_ix*3249;
    int32_t const lsb_ix = in_chan_ix % 5;
    // past the last channel, feed zeros into the window instead of reading in[]
    ls_buf[lsb_ix] = (in_chan_ix < 64) ? in[out_base_ix + in_off] : 0.0f;
    if( in_chan_ix >= hls ) {
      int32_t const out_chan_ix = in_chan_ix - hls;
      float ls_sum = 0.0f;
      for( int32_t i = 0; i != 5; ++i ) { ls_sum += ls_buf[i]*ls_buf[i]; }
      // float literals keep the whole expression in single precision; the
      // unsuffixed constants would otherwise be double literals
      float const scale = powf( (1.0f + 0.0001f*ls_sum/5.0f), -0.75f );
      // the window center sits hls slots behind the slot just written
      out[out_base_ix + out_chan_ix*3249] = ls_buf[(lsb_ix+5-hls) % 5] * scale;
    }
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* chans = 64 */ | |
/* ysz = 57 */ | |
/* xsz = 57 */ | |
/* local_size = 5 */ | |
/* alpha = 0.0001 */ | |
/* beta = 0.75 */ | |
/* k = 1 */ | |
/* rtc_func_name = lrn__num_imgs_20__chans_64__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1 */ | |
/* tix_x_dim = 57 */ | |
/* tix_x_sz = 1 */ | |
/* tix_x_nomod = tix */ | |
/* tix_x = (tix%%57) */ | |
/* tix_y_dim = 57 */ | |
/* tix_y_sz = 57 */ | |
/* tix_y_nomod = (tix/57) */ | |
/* tix_y = ((tix/57)%%57) */ | |
/* tix_img_dim = 20 */ | |
/* tix_img_sz = 3249 */ | |
/* tix_img_nomod = (tix/3249) */ | |
/* tix_img = (tix/3249) */ | |
/* tix_sz = 64980 */ | |
/* out_ix_x_dim = 57 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%57) */ | |
/* out_ix_y_dim = 57 */ | |
/* out_ix_y_sz = 57 */ | |
/* out_ix_y_nomod = (out_ix/57) */ | |
/* out_ix_y = ((out_ix/57)%%57) */ | |
/* out_ix_chan_dim = 64 */ | |
/* out_ix_chan_sz = 3249 */ | |
/* out_ix_chan_nomod = (out_ix/3249) */ | |
/* out_ix_chan = ((out_ix/3249)%%64) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 207936 */ | |
/* out_ix_img_nomod = (out_ix/207936) */ | |
/* out_ix_img = (out_ix/207936) */ | |
/* out_ix_sz = 4158720 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_57__in_dim_1_57__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_64( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
//int32_t const blk_in_ix_sz = 16*8; | |
LOCSHAR_MEM float all_smem[1536]; // note: max(filts+in,out) == max(512+1024,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 512; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*4096; // index of first out chan | |
int32_t blk_in_ix_base = GRP_ID_1D*8192 + LOC_ID_1D;// index of first input pel to load for this thread | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%8); | |
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/8); | |
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D; | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
// iterate over filter elements | |
for( int32_t blk_iter = 0; blk_iter != 8; ++blk_iter ) { | |
BARRIER_SYNC; | |
// begin smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)]; | |
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)]; | |
filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)]; | |
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ]; | |
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ]; | |
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ]; | |
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ]; | |
in_smem[(LOC_ID_1D + 128 * 4)] = in[ blk_in_ix_base + (128*4) ]; | |
in_smem[(LOC_ID_1D + 128 * 5)] = in[ blk_in_ix_base + (128*5) ]; | |
in_smem[(LOC_ID_1D + 128 * 6)] = in[ blk_in_ix_base + (128*6) ]; | |
in_smem[(LOC_ID_1D + 128 * 7)] = in[ blk_in_ix_base + (128*7) ]; | |
// end smem_loads; | |
BARRIER_SYNC; | |
filts_off += 64*8; | |
blk_in_ix_base += 1024; | |
// begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*64+0*8]; | |
filts_strip[1] = filts_smem_off[0*64+1*8]; | |
filts_strip[2] = filts_smem_off[0*64+2*8]; | |
filts_strip[3] = filts_smem_off[0*64+3*8]; | |
filts_strip[4] = filts_smem_off[0*64+4*8]; | |
filts_strip[5] = filts_smem_off[0*64+5*8]; | |
filts_strip[6] = filts_smem_off[0*64+6*8]; | |
filts_strip[7] = filts_smem_off[0*64+7*8]; | |
in_strip[0] = in_smem_off[(0*8*16+0)]; | |
in_strip[1] = in_smem_off[(0*8*16+1)]; | |
in_strip[2] = in_smem_off[(0*8*16+2)]; | |
in_strip[3] = in_smem_off[(0*8*16+3)]; | |
in_strip[4] = in_smem_off[(0*8*16+4)]; | |
in_strip[5] = in_smem_off[(0*8*16+5)]; | |
in_strip[6] = in_smem_off[(0*8*16+6)]; | |
in_strip[7] = in_smem_off[(0*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*64+0*8]; | |
filts_strip[1] = filts_smem_off[1*64+1*8]; | |
filts_strip[2] = filts_smem_off[1*64+2*8]; | |
filts_strip[3] = filts_smem_off[1*64+3*8]; | |
filts_strip[4] = filts_smem_off[1*64+4*8]; | |
filts_strip[5] = filts_smem_off[1*64+5*8]; | |
filts_strip[6] = filts_smem_off[1*64+6*8]; | |
filts_strip[7] = filts_smem_off[1*64+7*8]; | |
in_strip[0] = in_smem_off[(1*8*16+0)]; | |
in_strip[1] = in_smem_off[(1*8*16+1)]; | |
in_strip[2] = in_smem_off[(1*8*16+2)]; | |
in_strip[3] = in_smem_off[(1*8*16+3)]; | |
in_strip[4] = in_smem_off[(1*8*16+4)]; | |
in_strip[5] = in_smem_off[(1*8*16+5)]; | |
in_strip[6] = in_smem_off[(1*8*16+6)]; | |
in_strip[7] = in_smem_off[(1*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*64+0*8]; | |
filts_strip[1] = filts_smem_off[2*64+1*8]; | |
filts_strip[2] = filts_smem_off[2*64+2*8]; | |
filts_strip[3] = filts_smem_off[2*64+3*8]; | |
filts_strip[4] = filts_smem_off[2*64+4*8]; | |
filts_strip[5] = filts_smem_off[2*64+5*8]; | |
filts_strip[6] = filts_smem_off[2*64+6*8]; | |
filts_strip[7] = filts_smem_off[2*64+7*8]; | |
in_strip[0] = in_smem_off[(2*8*16+0)]; | |
in_strip[1] = in_smem_off[(2*8*16+1)]; | |
in_strip[2] = in_smem_off[(2*8*16+2)]; | |
in_strip[3] = in_smem_off[(2*8*16+3)]; | |
in_strip[4] = in_smem_off[(2*8*16+4)]; | |
in_strip[5] = in_smem_off[(2*8*16+5)]; | |
in_strip[6] = in_smem_off[(2*8*16+6)]; | |
in_strip[7] = in_smem_off[(2*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*64+0*8]; | |
filts_strip[1] = filts_smem_off[3*64+1*8]; | |
filts_strip[2] = filts_smem_off[3*64+2*8]; | |
filts_strip[3] = filts_smem_off[3*64+3*8]; | |
filts_strip[4] = filts_smem_off[3*64+4*8]; | |
filts_strip[5] = filts_smem_off[3*64+5*8]; | |
filts_strip[6] = filts_smem_off[3*64+6*8]; | |
filts_strip[7] = filts_smem_off[3*64+7*8]; | |
in_strip[0] = in_smem_off[(3*8*16+0)]; | |
in_strip[1] = in_smem_off[(3*8*16+1)]; | |
in_strip[2] = in_smem_off[(3*8*16+2)]; | |
in_strip[3] = in_smem_off[(3*8*16+3)]; | |
in_strip[4] = in_smem_off[(3*8*16+4)]; | |
in_strip[5] = in_smem_off[(3*8*16+5)]; | |
in_strip[6] = in_smem_off[(3*8*16+6)]; | |
in_strip[7] = in_smem_off[(3*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*64+0*8]; | |
filts_strip[1] = filts_smem_off[4*64+1*8]; | |
filts_strip[2] = filts_smem_off[4*64+2*8]; | |
filts_strip[3] = filts_smem_off[4*64+3*8]; | |
filts_strip[4] = filts_smem_off[4*64+4*8]; | |
filts_strip[5] = filts_smem_off[4*64+5*8]; | |
filts_strip[6] = filts_smem_off[4*64+6*8]; | |
filts_strip[7] = filts_smem_off[4*64+7*8]; | |
in_strip[0] = in_smem_off[(4*8*16+0)]; | |
in_strip[1] = in_smem_off[(4*8*16+1)]; | |
in_strip[2] = in_smem_off[(4*8*16+2)]; | |
in_strip[3] = in_smem_off[(4*8*16+3)]; | |
in_strip[4] = in_smem_off[(4*8*16+4)]; | |
in_strip[5] = in_smem_off[(4*8*16+5)]; | |
in_strip[6] = in_smem_off[(4*8*16+6)]; | |
in_strip[7] = in_smem_off[(4*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*64+0*8]; | |
filts_strip[1] = filts_smem_off[5*64+1*8]; | |
filts_strip[2] = filts_smem_off[5*64+2*8]; | |
filts_strip[3] = filts_smem_off[5*64+3*8]; | |
filts_strip[4] = filts_smem_off[5*64+4*8]; | |
filts_strip[5] = filts_smem_off[5*64+5*8]; | |
filts_strip[6] = filts_smem_off[5*64+6*8]; | |
filts_strip[7] = filts_smem_off[5*64+7*8]; | |
in_strip[0] = in_smem_off[(5*8*16+0)]; | |
in_strip[1] = in_smem_off[(5*8*16+1)]; | |
in_strip[2] = in_smem_off[(5*8*16+2)]; | |
in_strip[3] = in_smem_off[(5*8*16+3)]; | |
in_strip[4] = in_smem_off[(5*8*16+4)]; | |
in_strip[5] = in_smem_off[(5*8*16+5)]; | |
in_strip[6] = in_smem_off[(5*8*16+6)]; | |
in_strip[7] = in_smem_off[(5*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*64+0*8]; | |
filts_strip[1] = filts_smem_off[6*64+1*8]; | |
filts_strip[2] = filts_smem_off[6*64+2*8]; | |
filts_strip[3] = filts_smem_off[6*64+3*8]; | |
filts_strip[4] = filts_smem_off[6*64+4*8]; | |
filts_strip[5] = filts_smem_off[6*64+5*8]; | |
filts_strip[6] = filts_smem_off[6*64+6*8]; | |
filts_strip[7] = filts_smem_off[6*64+7*8]; | |
in_strip[0] = in_smem_off[(6*8*16+0)]; | |
in_strip[1] = in_smem_off[(6*8*16+1)]; | |
in_strip[2] = in_smem_off[(6*8*16+2)]; | |
in_strip[3] = in_smem_off[(6*8*16+3)]; | |
in_strip[4] = in_smem_off[(6*8*16+4)]; | |
in_strip[5] = in_smem_off[(6*8*16+5)]; | |
in_strip[6] = in_smem_off[(6*8*16+6)]; | |
in_strip[7] = in_smem_off[(6*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*64+0*8]; | |
filts_strip[1] = filts_smem_off[7*64+1*8]; | |
filts_strip[2] = filts_smem_off[7*64+2*8]; | |
filts_strip[3] = filts_smem_off[7*64+3*8]; | |
filts_strip[4] = filts_smem_off[7*64+4*8]; | |
filts_strip[5] = filts_smem_off[7*64+5*8]; | |
filts_strip[6] = filts_smem_off[7*64+6*8]; | |
filts_strip[7] = filts_smem_off[7*64+7*8]; | |
in_strip[0] = in_smem_off[(7*8*16+0)]; | |
in_strip[1] = in_smem_off[(7*8*16+1)]; | |
in_strip[2] = in_smem_off[(7*8*16+2)]; | |
in_strip[3] = in_smem_off[(7*8*16+3)]; | |
in_strip[4] = in_smem_off[(7*8*16+4)]; | |
in_strip[5] = in_smem_off[(7*8*16+5)]; | |
in_strip[6] = in_smem_off[(7*8*16+6)]; | |
in_strip[7] = in_smem_off[(7*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
; | |
} | |
// load per-block biases into smem | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 64 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*64; | |
int32_t const load_reg = t_smem_bias_ix / 8; | |
int32_t const load_tile = t_smem_bias_ix % 8; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 64 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*8]; | |
filts_strip[1] = filts_smem_off[1*8]; | |
filts_strip[2] = filts_smem_off[2*8]; | |
filts_strip[3] = filts_smem_off[3*8]; | |
filts_strip[4] = filts_smem_off[4*8]; | |
filts_strip[5] = filts_smem_off[5*8]; | |
filts_strip[6] = filts_smem_off[6*8]; | |
filts_strip[7] = filts_smem_off[7*8]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { | |
GASQ float * const out_off = out + LOC_ID_1D; | |
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]); | |
; | |
return; | |
} | |
// add bias to each elem of out_tile[] and store the results to out[] | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)%3249)*1 ; // cache out patch ixs | |
tpix[1] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)%3249)*1 ; // cache out patch ixs | |
tpix[2] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)%3249)*1 ; // cache out patch ixs | |
tpix[3] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)%3249)*1 ; // cache out patch ixs | |
tpix[4] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)%3249)*1 ; // cache out patch ixs | |
tpix[5] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)%3249)*1 ; // cache out patch ixs | |
tpix[6] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)%3249)*1 ; // cache out patch ixs | |
tpix[7] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/3249)*207936 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)%3249)*1 ; // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+0)*3249; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+1)*3249; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+2)*3249; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+3)*3249; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+4)*3249; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+5)*3249; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+6)*3249; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+7)*3249; // cache out chan ixs | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/3249) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*3249) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (64*3249) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (64*3249) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (64*3249) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (64*3249) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (64*3249) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (64*3249) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (64*3249) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 57 */ | |
/* in_dim_1 = 57 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 64 */ | |
/* write_xposed = 0 */ | |
/* in_chans = 64 */ | |
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_57__in_dim_1_57__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_64 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 57 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%57) */ | |
/* out_ix_y_dim = 57 */ | |
/* out_ix_y_sz = 57 */ | |
/* out_ix_y_nomod = (out_ix/57) */ | |
/* out_ix_y = ((out_ix/57)%%57) */ | |
/* out_ix_chan_dim = 64 */ | |
/* out_ix_chan_sz = 3249 */ | |
/* out_ix_chan_nomod = (out_ix/3249) */ | |
/* out_ix_chan = ((out_ix/3249)%%64) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 207936 */ | |
/* out_ix_img_nomod = (out_ix/207936) */ | |
/* out_ix_img = (out_ix/207936) */ | |
/* out_ix_sz = 4158720 */ | |
/* tpb = 128 */ | |
/* in_chan_tile = 8 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 8 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%8) */ | |
/* LOC_ID_1D_pels_tile_dim = 16 */ | |
/* LOC_ID_1D_pels_tile_sz = 8 */ | |
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/8) */ | |
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/8) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_pels_blk_dim = 508 */ | |
/* GRP_ID_1D_pels_blk_sz = 1 */ | |
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_pels_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 508 */ | |
/* in_ix_blk_pel_dim = 128 */ | |
/* in_ix_blk_pel_sz = 1 */ | |
/* in_ix_blk_pel_nomod = in_ix */ | |
/* in_ix_blk_pel = (in_ix%%128) */ | |
/* in_ix_blk_iter_chan_dim = 8 */ | |
/* in_ix_blk_iter_chan_sz = 128 */ | |
/* in_ix_blk_iter_chan_nomod = (in_ix/128) */ | |
/* in_ix_blk_iter_chan = ((in_ix/128)%%8) */ | |
/* in_ix_blk_iter_dim = 8 */ | |
/* in_ix_blk_iter_sz = 1024 */ | |
/* in_ix_blk_iter_nomod = (in_ix/1024) */ | |
/* in_ix_blk_iter = ((in_ix/1024)%%8) */ | |
/* in_ix_blk_dim = 508 */ | |
/* in_ix_blk_sz = 8192 */ | |
/* in_ix_blk_nomod = (in_ix/8192) */ | |
/* in_ix_blk = (in_ix/8192) */ | |
/* in_ix_sz = 4161536 */ | |
/* blk_filt_ix_sz = 64 */ | |
/* filts_smem_sz = 512 */ | |
/* in_smem_sz = 1024 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1536 */ | |
/* filts_xp_ix_out_chan_tile_dim = 8 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 8 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */ | |
/* filts_xp_ix_in_chan_dim = 64 */ | |
/* filts_xp_ix_in_chan_sz = 64 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%64) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 4096 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/4096) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/4096) */ | |
/* filts_xp_ix_sz = 4096 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* smem_loads = // begin smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)]; | |
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 6)] = in[ blk_in_ix_base + (%(tpb)*6) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 7)] = in[ blk_in_ix_base + (%(tpb)*7) ]; | |
// end smem_loads */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* t_smem_ld_pel_pel_dim = 128 */ | |
/* t_smem_ld_pel_pel_sz = 1 */ | |
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */ | |
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%128) */ | |
/* t_smem_ld_pel_chan_dim = 8 */ | |
/* t_smem_ld_pel_chan_sz = 128 */ | |
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/128) */ | |
/* t_smem_ld_pel_chan = (t_smem_ld_pel/128) */ | |
/* t_smem_ld_pel_sz = 1024 */ | |
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */ | |
/* out_pel_0_pel_dim = 3249 */ | |
/* out_pel_0_pel_sz = 1 */ | |
/* out_pel_0_pel_nomod = %(out_pel_0) */ | |
/* out_pel_0_pel = (%(out_pel_0)%%3249) */ | |
/* out_pel_0_img_dim = 20 */ | |
/* out_pel_0_img_sz = 3249 */ | |
/* out_pel_0_img_nomod = (%(out_pel_0)/3249) */ | |
/* out_pel_0_img = (%(out_pel_0)/3249) */ | |
/* out_pel_0_sz = 64980 */ | |
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */ | |
/* out_pel_1_pel_dim = 3249 */ | |
/* out_pel_1_pel_sz = 1 */ | |
/* out_pel_1_pel_nomod = %(out_pel_1) */ | |
/* out_pel_1_pel = (%(out_pel_1)%%3249) */ | |
/* out_pel_1_img_dim = 20 */ | |
/* out_pel_1_img_sz = 3249 */ | |
/* out_pel_1_img_nomod = (%(out_pel_1)/3249) */ | |
/* out_pel_1_img = (%(out_pel_1)/3249) */ | |
/* out_pel_1_sz = 64980 */ | |
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */ | |
/* out_pel_2_pel_dim = 3249 */ | |
/* out_pel_2_pel_sz = 1 */ | |
/* out_pel_2_pel_nomod = %(out_pel_2) */ | |
/* out_pel_2_pel = (%(out_pel_2)%%3249) */ | |
/* out_pel_2_img_dim = 20 */ | |
/* out_pel_2_img_sz = 3249 */ | |
/* out_pel_2_img_nomod = (%(out_pel_2)/3249) */ | |
/* out_pel_2_img = (%(out_pel_2)/3249) */ | |
/* out_pel_2_sz = 64980 */ | |
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */ | |
/* out_pel_3_pel_dim = 3249 */ | |
/* out_pel_3_pel_sz = 1 */ | |
/* out_pel_3_pel_nomod = %(out_pel_3) */ | |
/* out_pel_3_pel = (%(out_pel_3)%%3249) */ | |
/* out_pel_3_img_dim = 20 */ | |
/* out_pel_3_img_sz = 3249 */ | |
/* out_pel_3_img_nomod = (%(out_pel_3)/3249) */ | |
/* out_pel_3_img = (%(out_pel_3)/3249) */ | |
/* out_pel_3_sz = 64980 */ | |
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */ | |
/* out_pel_4_pel_dim = 3249 */ | |
/* out_pel_4_pel_sz = 1 */ | |
/* out_pel_4_pel_nomod = %(out_pel_4) */ | |
/* out_pel_4_pel = (%(out_pel_4)%%3249) */ | |
/* out_pel_4_img_dim = 20 */ | |
/* out_pel_4_img_sz = 3249 */ | |
/* out_pel_4_img_nomod = (%(out_pel_4)/3249) */ | |
/* out_pel_4_img = (%(out_pel_4)/3249) */ | |
/* out_pel_4_sz = 64980 */ | |
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */ | |
/* out_pel_5_pel_dim = 3249 */ | |
/* out_pel_5_pel_sz = 1 */ | |
/* out_pel_5_pel_nomod = %(out_pel_5) */ | |
/* out_pel_5_pel = (%(out_pel_5)%%3249) */ | |
/* out_pel_5_img_dim = 20 */ | |
/* out_pel_5_img_sz = 3249 */ | |
/* out_pel_5_img_nomod = (%(out_pel_5)/3249) */ | |
/* out_pel_5_img = (%(out_pel_5)/3249) */ | |
/* out_pel_5_sz = 64980 */ | |
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */ | |
/* out_pel_6_pel_dim = 3249 */ | |
/* out_pel_6_pel_sz = 1 */ | |
/* out_pel_6_pel_nomod = %(out_pel_6) */ | |
/* out_pel_6_pel = (%(out_pel_6)%%3249) */ | |
/* out_pel_6_img_dim = 20 */ | |
/* out_pel_6_img_sz = 3249 */ | |
/* out_pel_6_img_nomod = (%(out_pel_6)/3249) */ | |
/* out_pel_6_img = (%(out_pel_6)/3249) */ | |
/* out_pel_6_sz = 64980 */ | |
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */ | |
/* out_pel_7_pel_dim = 3249 */ | |
/* out_pel_7_pel_sz = 1 */ | |
/* out_pel_7_pel_nomod = %(out_pel_7) */ | |
/* out_pel_7_pel = (%(out_pel_7)%%3249) */ | |
/* out_pel_7_img_dim = 20 */ | |
/* out_pel_7_img_sz = 3249 */ | |
/* out_pel_7_img_nomod = (%(out_pel_7)/3249) */ | |
/* out_pel_7_img = (%(out_pel_7)/3249) */ | |
/* out_pel_7_sz = 64980 */ | |
/* t_tile_stores = // begin t_tile_stores | |
int32_t tpix[%(t_tile_sz)]; | |
int32_t tcix[%(t_tile_sz)]; | |
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs | |
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
// Commented-out text labelled `t_tile_dummy_stores`; nothing between the
// opening and closing comment markers below is compiled.
// It lists 64 stores, one per out_tile element: for i in 0..63, out_off[128*i]
// receives max(0.0f, out_tile[i] + filts_strip[i % 8]), i.e. the sum clamped
// at zero.
// NOTE(review): this looks like a dump of generated template output, and
// filts_strip presumably holds per-output-channel bias values at this point
// -- confirm both against the code generator.
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]); | |
*/ | |
// Commented-out text labelled `t_tile_bias_loads`; nothing between the
// opening and closing comment markers below is compiled.
// It lists eight loads that fill filts_strip[0..7] from filts_smem_off at
// multiples 0..7 of %(LOC_ID_1D_out_chan_tile_dim). The %(...) placeholder is
// left unexpanded in this text, so the concrete stride is not visible here.
// NOTE(review): the label suggests filts_smem_off addresses bias values when
// these loads run -- confirm against the code generator.
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
*/ | |
// Copies `in` into the blocked layout of `out`, writing 0.0f wherever the
// decoded source position does not exist.
//
// One work-item produces one element of `out`. The flat output index splits as
//   out_ix = blk*8192 + blk_iter*1024 + blk_iter_chan*128 + blk_pel
// with blk_iter and blk_iter_chan in [0,8) and blk_pel in [0,128). From it:
//   chan_ix = blk_iter*8 + blk_iter_chan   (source channel)
//   pel_ix  = blk*128 + blk_pel            (merged img:y:x index, 3249 = 57*57 per image)
// The source element is read from in[ img*207936 + chan*3249 + y*57 + x ].
//
// in:  source buffer, indexed as shown above (20 images, 64 chans, 57x57).
// out: destination buffer; must hold at least 4161536 (= 508*8192) floats.
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_64__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_16__bix_pels_blk_sz_508( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  // Work-items past the end of `out` (e.g. from a rounded-up global work size)
  // have nothing to write; same guard style as the xpose_filts kernel.
  if( out_ix >= 4161536 ) { return; }
  int32_t const chan_ix = ((out_ix/1024)%8)*8 + ((out_ix/128)%8);
  int32_t const pel_ix = (out_ix/8192)*128 + (out_ix%128);
  float v = 0.0f; // value kept for padding positions
  // Only read `in` when both the channel and the image index are in range;
  // the last output block extends past the final image (508*128 > 20*3249).
  if( ( chan_ix < 64 ) && ( (pel_ix/3249) < 20 ) ) {
    v = in[ (pel_ix/3249)*207936 +   // image
            chan_ix*3249 +           // channel
            ((pel_ix/57)%57)*57 +    // y
            (pel_ix%57)*1 ];         // x
  }
  out[out_ix] = v;
}
/* | |
in_pels = num_img * in.sz.dims_prod() | |
num_in_blks = u32_ceil_div( in_pels, block_chan_pels ) | |
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged | |
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12 | |
pad_chan = block_iters * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64 | |
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512 | |
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512=2K (per-iter cache) is fine. | |
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel | |
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?) | |
*/ | |
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* in_chan_tile = 8 */ | |
/* pad_in_chans = 64 */ | |
/* in_chans = 64 */ | |
/* ysz = 57 */ | |
/* xsz = 57 */ | |
/* tix_pels_tile_sz = 16 */ | |
/* bix_pels_blk_sz = 508 */ | |
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_64__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_16__bix_pels_blk_sz_508 */ | |
/* out_ix_blk_pel_dim = 128 */ | |
/* out_ix_blk_pel_sz = 1 */ | |
/* out_ix_blk_pel_nomod = out_ix */ | |
/* out_ix_blk_pel = (out_ix%%128) */ | |
/* out_ix_blk_iter_chan_dim = 8 */ | |
/* out_ix_blk_iter_chan_sz = 128 */ | |
/* out_ix_blk_iter_chan_nomod = (out_ix/128) */ | |
/* out_ix_blk_iter_chan = ((out_ix/128)%%8) */ | |
/* out_ix_blk_iter_dim = 8 */ | |
/* out_ix_blk_iter_sz = 1024 */ | |
/* out_ix_blk_iter_nomod = (out_ix/1024) */ | |
/* out_ix_blk_iter = ((out_ix/1024)%%8) */ | |
/* out_ix_blk_dim = 508 */ | |
/* out_ix_blk_sz = 8192 */ | |
/* out_ix_blk_nomod = (out_ix/8192) */ | |
/* out_ix_blk = (out_ix/8192) */ | |
/* out_ix_sz = 4161536 */ | |
/* pel_ix_x_dim = 57 */ | |
/* pel_ix_x_sz = 1 */ | |
/* pel_ix_x_nomod = pel_ix */ | |
/* pel_ix_x = (pel_ix%%57) */ | |
/* pel_ix_y_dim = 57 */ | |
/* pel_ix_y_sz = 57 */ | |
/* pel_ix_y_nomod = (pel_ix/57) */ | |
/* pel_ix_y = ((pel_ix/57)%%57) */ | |
/* pel_ix_img_dim = 20 */ | |
/* pel_ix_img_sz = 3249 */ | |
/* pel_ix_img_nomod = (pel_ix/3249) */ | |
/* pel_ix_img = (pel_ix/3249) */ | |
/* pel_ix_sz = 64980 */ | |
/* in_ix_x_dim = 57 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%57) */ | |
/* in_ix_y_dim = 57 */ | |
/* in_ix_y_sz = 57 */ | |
/* in_ix_y_nomod = (in_ix/57) */ | |
/* in_ix_y = ((in_ix/57)%%57) */ | |
/* in_ix_chan_dim = 64 */ | |
/* in_ix_chan_sz = 3249 */ | |
/* in_ix_chan_nomod = (in_ix/3249) */ | |
/* in_ix_chan = ((in_ix/3249)%%64) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 207936 */ | |
/* in_ix_img_nomod = (in_ix/207936) */ | |
/* in_ix_img = (in_ix/207936) */ | |
/* in_ix_sz = 4158720 */ | |
// Permutes the filter weights from the `in` layout (out_chan:in_chan:y:x) into
// the `out` layout (out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile).
// Each work-item moves one element: it reads in[filts_ix] and stores it at the
// permuted index filts_xp_ix. Work-items with a global id of 4096 or more do nothing.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_64__in_chans_64__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix < 4096 ) {
    // Split the source index into its per-dimension parts.
    int32_t const fioc = filts_ix / 64;    // output channel
    int32_t const in_chan = filts_ix % 64; // input channel
    int32_t const kern_yx = filts_ix % 1;  // kernel y and x index; the kernel is 1x1, so always 0

    // Split the output channel into block / tile / register parts.
    int32_t const oc_blk = fioc / 64;
    int32_t const oc_tile = (fioc / 8) % 8;
    int32_t const oc_reg = fioc % 8;

    // Recombine using the strides of the output layout.
    int32_t const filts_xp_ix = oc_blk*4096 + in_chan*64 + kern_yx*64 + kern_yx*64 + oc_reg*8 + oc_tile*1;

#if 1
    float const val = in[filts_ix];
#else
    // Debug variant: emit a synthetic pattern for in_chan 0 and zero elsewhere.
    float const val = ( in_chan == 0 ) ? (float)( kern_yx*100 + kern_yx ) : 0.0f;
#endif
    out[filts_xp_ix] = val;
  }
}
// -- template substitution table used: --
/* out_chans = 64 */ | |
/* in_chans = 64 */ | |
/* kysz = 1 */ | |
/* kxsz = 1 */ | |
/* rtc_func_name = xpose_filts__out_chans_64__in_chans_64__kysz_1__kxsz_1 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 1 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%1) */ | |
/* filts_ix_y_dim = 1 */ | |
/* filts_ix_y_sz = 1 */ | |
/* filts_ix_y_nomod = filts_ix */ | |
/* filts_ix_y = (filts_ix%%1) */ | |
/* filts_ix_in_chan_dim = 64 */ | |
/* filts_ix_in_chan_sz = 1 */ | |
/* filts_ix_in_chan_nomod = filts_ix */ | |
/* filts_ix_in_chan = (filts_ix%%64) */ | |
/* filts_ix_out_chan_dim = 64 */ | |
/* filts_ix_out_chan_sz = 64 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/64) */ | |
/* filts_ix_out_chan = (filts_ix/64) */ | |
/* filts_ix_sz = 4096 */ | |
/* filts_xp_ix_out_chan_tile_dim = 8 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 8 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 64 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/64)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 64 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/64)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 64 */ | |
/* filts_xp_ix_in_chan_sz = 64 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%64) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 4096 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/4096) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/4096) */ | |
/* filts_xp_ix_sz = 4096 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 8 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%8) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 64 */ | |
/* fioc_out_chan_blk_nomod = (fioc/64) */ | |
/* fioc_out_chan_blk = (fioc/64) */ | |
/* fioc_sz = 64 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_57__in_dim_1_57__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_64( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(384+120,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 384; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[10]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t blk_in_ix_base = (GRP_ID_1D/2)*7680 + LOC_ID_1D;// index of first input pel to load for this thread | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%2)*73728; // index of first out chan | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16); | |
int32_t out_line = (GRP_ID_1D/16)*8; // first out_line of block | |
int32_t const blk_fli = (out_line/57); // image of first out_line of block | |
out_line += (LOC_ID_1D/16); // adjust to out_line of this thread | |
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img) | |
int32_t const img_off_lines = ((out_line/57) - blk_fli)*(3-1); | |
int32_t const in_y = (out_line%57)*1 - 1; | |
for( int32_t in_chan = 0; in_chan != 64; ++in_chan ) { | |
BARRIER_SYNC; | |
// begin in_smem_loads | |
if( (LOC_ID_1D + 128 * 0) < 120) { in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];} | |
blk_in_ix_base += 120; | |
// end in_smem_loads; | |
for( int32_t ky = 0; ky != 3; ++ky ) { | |
if( ky != 0 ) { BARRIER_SYNC; } | |
// begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)]; | |
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)]; | |
filts_off += 384; | |
// end filt_smem_loads; | |
BARRIER_SYNC; | |
if( (out_line/57) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid) | |
if( ((in_y+ky) < 0) || ((in_y+ky)>57) ) { continue; } // optimization: skip known-to-be-padding input lines | |
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/16)*1+ky+img_off_lines)*10; | |
// begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
filts_strip[0] = filts_smem_off[0*128+0*16]; | |
filts_strip[1] = filts_smem_off[0*128+1*16]; | |
filts_strip[2] = filts_smem_off[0*128+2*16]; | |
filts_strip[3] = filts_smem_off[0*128+3*16]; | |
filts_strip[4] = filts_smem_off[0*128+4*16]; | |
filts_strip[5] = filts_smem_off[0*128+5*16]; | |
filts_strip[6] = filts_smem_off[0*128+6*16]; | |
filts_strip[7] = filts_smem_off[0*128+7*16]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*128+0*16]; | |
filts_strip[1] = filts_smem_off[1*128+1*16]; | |
filts_strip[2] = filts_smem_off[1*128+2*16]; | |
filts_strip[3] = filts_smem_off[1*128+3*16]; | |
filts_strip[4] = filts_smem_off[1*128+4*16]; | |
filts_strip[5] = filts_smem_off[1*128+5*16]; | |
filts_strip[6] = filts_smem_off[1*128+6*16]; | |
filts_strip[7] = filts_smem_off[1*128+7*16]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
filts_strip[0] = filts_smem_off[2*128+0*16]; | |
filts_strip[1] = filts_smem_off[2*128+1*16]; | |
filts_strip[2] = filts_smem_off[2*128+2*16]; | |
filts_strip[3] = filts_smem_off[2*128+3*16]; | |
filts_strip[4] = filts_smem_off[2*128+4*16]; | |
filts_strip[5] = filts_smem_off[2*128+5*16]; | |
filts_strip[6] = filts_smem_off[2*128+6*16]; | |
filts_strip[7] = filts_smem_off[2*128+7*16]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
; | |
} | |
} | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 128 ) { | |
int32_t const ocix_base = (GRP_ID_1D%2)*128; | |
int32_t const load_reg = t_smem_bias_ix / 16; | |
int32_t const load_tile = t_smem_bias_ix % 16; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 192 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*16]; | |
filts_strip[1] = filts_smem_off[1*16]; | |
filts_strip[2] = filts_smem_off[2*16]; | |
filts_strip[3] = filts_smem_off[3*16]; | |
filts_strip[4] = filts_smem_off[4*16]; | |
filts_strip[5] = filts_smem_off[5*16]; | |
filts_strip[6] = filts_smem_off[6*16]; | |
filts_strip[7] = filts_smem_off[7*16]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { return; } | |
// begin t_tile_stores | |
if( (out_line/57) >= 20 ) { return; } | |
int32_t out_x = ((GRP_ID_1D/2)%8)*8; | |
int32_t out_chan = ((GRP_ID_1D%2)*16 + (LOC_ID_1D%16))*8; | |
GASQ float * out_off = out + (out_line/57)*623808 + out_chan*3249 + (out_line%57)*57 + out_x*1 ; | |
if( (out_x + 0) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= 57 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 192 ) { out_off[ 0*3249 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < 192 ) { out_off[ 1*3249 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < 192 ) { out_off[ 2*3249 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < 192 ) { out_off[ 3*3249 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < 192 ) { out_off[ 4*3249 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < 192 ) { out_off[ 5*3249 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < 192 ) { out_off[ 6*3249 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < 192 ) { out_off[ 7*3249 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 57 */ | |
/* in_dim_1 = 57 */ | |
/* kern_sz = 3 */ | |
/* stride = 1 */ | |
/* in_pad = 1 */ | |
/* t_tile_sz = 8 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 192 */ | |
/* in_chans = 64 */ | |
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_57__in_dim_1_57__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_64 */ | |
/* out_ix_x_dim = 57 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%57) */ | |
/* out_ix_y_dim = 57 */ | |
/* out_ix_y_sz = 57 */ | |
/* out_ix_y_nomod = (out_ix/57) */ | |
/* out_ix_y = ((out_ix/57)%%57) */ | |
/* out_ix_chan_dim = 192 */ | |
/* out_ix_chan_sz = 3249 */ | |
/* out_ix_chan_nomod = (out_ix/3249) */ | |
/* out_ix_chan = ((out_ix/3249)%%192) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 623808 */ | |
/* out_ix_img_nomod = (out_ix/623808) */ | |
/* out_ix_img = (out_ix/623808) */ | |
/* out_ix_sz = 12476160 */ | |
/* tpb = 128 */ | |
/* out_line_y_dim = 57 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%57) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 57 */ | |
/* out_line_img_nomod = (out_line/57) */ | |
/* out_line_img = (out_line/57) */ | |
/* out_line_sz = 1140 */ | |
/* in_ix_blk_x_dim = 10 */ | |
/* in_ix_blk_x_sz = 1 */ | |
/* in_ix_blk_x_nomod = in_ix */ | |
/* in_ix_blk_x = (in_ix%%10) */ | |
/* in_ix_blk_y_dim = 12 */ | |
/* in_ix_blk_y_sz = 10 */ | |
/* in_ix_blk_y_nomod = (in_ix/10) */ | |
/* in_ix_blk_y = ((in_ix/10)%%12) */ | |
/* in_ix_blk_in_chan_dim = 64 */ | |
/* in_ix_blk_in_chan_sz = 120 */ | |
/* in_ix_blk_in_chan_nomod = (in_ix/120) */ | |
/* in_ix_blk_in_chan = ((in_ix/120)%%64) */ | |
/* in_ix_blk_bx_dim = 8 */ | |
/* in_ix_blk_bx_sz = 7680 */ | |
/* in_ix_blk_bx_nomod = (in_ix/7680) */ | |
/* in_ix_blk_bx = ((in_ix/7680)%%8) */ | |
/* in_ix_blk_bline_dim = 143 */ | |
/* in_ix_blk_bline_sz = 61440 */ | |
/* in_ix_blk_bline_nomod = (in_ix/61440) */ | |
/* in_ix_blk_bline = (in_ix/61440) */ | |
/* in_ix_sz = 8785920 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 16 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */ | |
/* LOC_ID_1D_blk_y_dim = 8 */ | |
/* LOC_ID_1D_blk_y_sz = 16 */ | |
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_blk_y = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 2 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%2) */ | |
/* GRP_ID_1D_blk_bx_dim = 8 */ | |
/* GRP_ID_1D_blk_bx_sz = 2 */ | |
/* GRP_ID_1D_blk_bx_nomod = (GRP_ID_1D/2) */ | |
/* GRP_ID_1D_blk_bx = ((GRP_ID_1D/2)%%8) */ | |
/* GRP_ID_1D_blk_bline_dim = 143 */ | |
/* GRP_ID_1D_blk_bline_sz = 16 */ | |
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/16) */ | |
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/16) */ | |
/* GRP_ID_1D_sz = 2288 */ | |
/* blk_filt_ix_sz = 128 */ | |
/* filts_smem_sz = 384 */ | |
/* in_smem_sz = 120 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1024 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 3 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */ | |
/* filts_xp_ix_y_dim = 3 */ | |
/* filts_xp_ix_y_sz = 384 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */ | |
/* filts_xp_ix_in_chan_dim = 64 */ | |
/* filts_xp_ix_in_chan_sz = 1152 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%64) */ | |
/* filts_xp_ix_out_chan_blk_dim = 2 */ | |
/* filts_xp_ix_out_chan_blk_sz = 73728 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/73728) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/73728) */ | |
/* filts_xp_ix_sz = 147456 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* filt_smem_loads = // begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_off += %(filts_xp_ix_y_sz); | |
// end filt_smem_loads */ | |
/* in_smem_loads = // begin in_smem_loads | |
if( (LOC_ID_1D + %(tpb) * 0) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];} | |
blk_in_ix_base += %(in_ix_blk_in_chan_sz); | |
// end in_smem_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; } | |
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz); | |
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz); | |
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ; | |
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
// Copies `in` into a blocked layout, one output element per work-item
// (8785920 elements in total; extra work-items do nothing).
//
// The flat output index is split, fastest dimension first, into
//   x (10) : y (12) : in_chan (64) : bx (8) : bline (143)
// which matches the out_ix_blk_* entries of the substitution table that
// follows this kernel. Source coordinates are shifted by -1 (in_pad = 1 in
// that table); any element whose source lies outside the 57x57 plane, or in
// an image index >= 20, is written as 0.0f.
//
// in  : read with strides img=207936, chan=3249, y=57, x=1
// out : written at the flat work-item index
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_1144( GASQ float const * const in, GASQ float * const out ) {
  int32_t const gid = GLOB_ID_1D;
  if( gid < 8785920 ) {
    // decompose the flat output index into its block coordinates
    int32_t const blk_x     = gid % 10;
    int32_t const blk_y     = (gid / 10) % 12;
    int32_t const blk_chan  = (gid / 120) % 64;
    int32_t const blk_bx    = (gid / 7680) % 8;
    int32_t const blk_bline = gid / 61440;

    // first output line covered by this block row (8 lines per block row)
    int32_t const first_line = blk_bline * 8;
    // line inside the per-image strip of source lines
    int32_t const src_line = blk_y + (first_line % 57) * 1;
    int32_t const lines_per_img = (57 - 1) * 1 + 3;
    // a block row may run past the end of one image into the next one
    int32_t const src_img = (first_line / 57) + (src_line / lines_per_img);
    int32_t const src_y = (src_line % lines_per_img) - 1;
    int32_t const src_x = blk_bx * 8 * 1 + blk_x - 1;

    int32_t const outside =
      ( src_x < 0 ) || ( src_y < 0 ) ||
      ( src_x >= 57 ) || ( src_y >= 57 ) ||
      ( src_img >= 20 );

    float result = 0.0f;
    if( !outside ) {
      int32_t const src_ix = src_img * 207936 + blk_chan * 3249 + src_y * 57 + src_x * 1;
      result = in[ src_ix ];
    }
    out[gid] = result;
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* stride = 1 */ | |
/* kern_sz = 3 */ | |
/* in_pad = 1 */ | |
/* in_chans = 64 */ | |
/* ysz = 57 */ | |
/* xsz = 57 */ | |
/* tix_pels_tile_sz = 8 */ | |
/* t_tile_sz = 8 */ | |
/* bix_pels_blk_sz = 1144 */ | |
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_64__ysz_57__xsz_57__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_1144 */ | |
/* out_ix_blk_x_dim = 10 */ | |
/* out_ix_blk_x_sz = 1 */ | |
/* out_ix_blk_x_nomod = out_ix */ | |
/* out_ix_blk_x = (out_ix%%10) */ | |
/* out_ix_blk_y_dim = 12 */ | |
/* out_ix_blk_y_sz = 10 */ | |
/* out_ix_blk_y_nomod = (out_ix/10) */ | |
/* out_ix_blk_y = ((out_ix/10)%%12) */ | |
/* out_ix_blk_in_chan_dim = 64 */ | |
/* out_ix_blk_in_chan_sz = 120 */ | |
/* out_ix_blk_in_chan_nomod = (out_ix/120) */ | |
/* out_ix_blk_in_chan = ((out_ix/120)%%64) */ | |
/* out_ix_blk_bx_dim = 8 */ | |
/* out_ix_blk_bx_sz = 7680 */ | |
/* out_ix_blk_bx_nomod = (out_ix/7680) */ | |
/* out_ix_blk_bx = ((out_ix/7680)%%8) */ | |
/* out_ix_blk_bline_dim = 143 */ | |
/* out_ix_blk_bline_sz = 61440 */ | |
/* out_ix_blk_bline_nomod = (out_ix/61440) */ | |
/* out_ix_blk_bline = (out_ix/61440) */ | |
/* out_ix_sz = 8785920 */ | |
/* out_line_y_dim = 57 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%57) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 57 */ | |
/* out_line_img_nomod = (out_line/57) */ | |
/* out_line_img = (out_line/57) */ | |
/* out_line_sz = 1140 */ | |
/* in_ix_x_dim = 57 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%57) */ | |
/* in_ix_y_dim = 57 */ | |
/* in_ix_y_sz = 57 */ | |
/* in_ix_y_nomod = (in_ix/57) */ | |
/* in_ix_y = ((in_ix/57)%%57) */ | |
/* in_ix_chan_dim = 64 */ | |
/* in_ix_chan_sz = 3249 */ | |
/* in_ix_chan_nomod = (in_ix/3249) */ | |
/* in_ix_chan = ((in_ix/3249)%%64) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 207936 */ | |
/* in_ix_img_nomod = (in_ix/207936) */ | |
/* in_ix_img = (in_ix/207936) */ | |
/* in_ix_sz = 4158720 */ | |
// Re-orders the filter weights, one element per work-item (110592 elements).
//
// The source index filts_ix is split as out_chan:in_chan:y:x (x fastest).
// The output channel is split again into blk (128 chans) : tile (16) : reg (8),
// and the element is stored with strides
//   out_chan_blk=73728, in_chan=1152, y=384, x=128, out_chan_reg=16, out_chan_tile=1
// as listed by the filts_xp_ix_* entries of the substitution table that
// follows this kernel. Only destination slots reached from a source element
// are written.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_192__in_chans_64__kysz_3__kxsz_3( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix < 110592 ) {
    // source coordinates
    int32_t const kx      = filts_ix % 3;
    int32_t const ky      = (filts_ix / 3) % 3;
    int32_t const in_chan = (filts_ix / 9) % 64;
    int32_t const fioc    = filts_ix / 576; // output channel

    // output channel split into block / tile / register slots
    int32_t const oc_reg  = fioc % 8;
    int32_t const oc_tile = (fioc / 8) % 16;
    int32_t const oc_blk  = fioc / 128;

    int32_t const filts_xp_ix =
      oc_blk  * 73728 +
      in_chan * 1152 +
      ky      * 384 +
      kx      * 128 +
      oc_reg  * 16 +
      oc_tile * 1;

    float val = 0.0f;
#if 1
    val = in[filts_ix];
#else
    // debugging aid: replace the weights of input channel 0 by a value that
    // encodes the kernel position; everything else stays 0
    if( in_chan == 0 ) {
      val = kx*100 + ky;
    }
#endif
    out[filts_xp_ix] = val;
  }
}
// -- template substitution table used: --
/* out_chans = 192 */ | |
/* in_chans = 64 */ | |
/* kysz = 3 */ | |
/* kxsz = 3 */ | |
/* rtc_func_name = xpose_filts__out_chans_192__in_chans_64__kysz_3__kxsz_3 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 3 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%3) */ | |
/* filts_ix_y_dim = 3 */ | |
/* filts_ix_y_sz = 3 */ | |
/* filts_ix_y_nomod = (filts_ix/3) */ | |
/* filts_ix_y = ((filts_ix/3)%%3) */ | |
/* filts_ix_in_chan_dim = 64 */ | |
/* filts_ix_in_chan_sz = 9 */ | |
/* filts_ix_in_chan_nomod = (filts_ix/9) */ | |
/* filts_ix_in_chan = ((filts_ix/9)%%64) */ | |
/* filts_ix_out_chan_dim = 192 */ | |
/* filts_ix_out_chan_sz = 576 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/576) */ | |
/* filts_ix_out_chan = (filts_ix/576) */ | |
/* filts_ix_sz = 110592 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 3 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */ | |
/* filts_xp_ix_y_dim = 3 */ | |
/* filts_xp_ix_y_sz = 384 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */ | |
/* filts_xp_ix_in_chan_dim = 64 */ | |
/* filts_xp_ix_in_chan_sz = 1152 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%64) */ | |
/* filts_xp_ix_out_chan_blk_dim = 2 */ | |
/* filts_xp_ix_out_chan_blk_sz = 73728 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/73728) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/73728) */ | |
/* filts_xp_ix_sz = 147456 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 16 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%16) */ | |
/* fioc_out_chan_blk_dim = 2 */ | |
/* fioc_out_chan_blk_sz = 128 */ | |
/* fioc_out_chan_blk_nomod = (fioc/128) */ | |
/* fioc_out_chan_blk = (fioc/128) */ | |
/* fioc_sz = 256 */ | |
// each thread: computes outputs across chan dim, using inputs across chan dim
//
// Cross-channel normalization over a sliding window of 5 channels:
//   out[c] = in[c] * ( 1 + 0.0001 * sum_{j=c-2..c+2} in[j]^2 / 5 ) ^ -0.75
// Channels outside [0,192) contribute 0 to the sum.
//
// One work-item per (img, y, x) position (64980 in total); each walks all 192
// channels of its position. `in` and `out` are indexed with the same strides:
// img=623808, chan=3249, y=57, x=1.
//
// All floating-point constants carry an `f` suffix so the scale is evaluated
// in single precision; unsuffixed literals would make the expression double.
CUCL_GLOBAL_KERNEL void lrn__num_imgs_20__chans_192__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1( GASQ float const * const in, GASQ float * const out ) {
  int32_t const tix = GLOB_ID_1D;
  if( tix >= 64980 ) { return; }
  // iterate over chans
  float ls_buf[5] = {0.0f}; // ring buffer holding the last 5 channel values
  int32_t const hls = 5 >> 1; // half window size
  int32_t const out_base_ix = (tix/3249)*623808 + ((tix/57)%57)*57 + (tix%57)*1;
  // run hls channels past the end so the last outputs see their full (zero-padded) window
  for( int32_t in_chan_ix = 0; in_chan_ix < 192 + hls; ++in_chan_ix ) {
    int32_t const in_off = in_chan_ix*3249;
    int32_t const lsb_ix = in_chan_ix % 5;
    ls_buf[lsb_ix] = (in_chan_ix < 192) ? in[out_base_ix + in_off] : 0.0f;
    if( in_chan_ix >= hls ) {
      // the window centered on out_chan_ix is now complete
      int32_t const out_chan_ix = in_chan_ix - hls;
      float ls_sum = 0.0f;
      for( int32_t i = 0; i != 5; ++i ) { ls_sum += ls_buf[i]*ls_buf[i]; }
      float const scale = powf( (1.0f + 0.0001f*ls_sum/5.0f), -0.75f );
      // the center value was stored hls iterations ago
      out[out_base_ix + out_chan_ix*3249] = ls_buf[(lsb_ix+5-hls) % 5] * scale;
    }
  }
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* chans = 192 */ | |
/* ysz = 57 */ | |
/* xsz = 57 */ | |
/* local_size = 5 */ | |
/* alpha = 0.0001 */ | |
/* beta = 0.75 */ | |
/* k = 1 */ | |
/* rtc_func_name = lrn__num_imgs_20__chans_192__ysz_57__xsz_57__local_size_5__alpha_0_0001__beta_0_75__k_1 */ | |
/* tix_x_dim = 57 */ | |
/* tix_x_sz = 1 */ | |
/* tix_x_nomod = tix */ | |
/* tix_x = (tix%%57) */ | |
/* tix_y_dim = 57 */ | |
/* tix_y_sz = 57 */ | |
/* tix_y_nomod = (tix/57) */ | |
/* tix_y = ((tix/57)%%57) */ | |
/* tix_img_dim = 20 */ | |
/* tix_img_sz = 3249 */ | |
/* tix_img_nomod = (tix/3249) */ | |
/* tix_img = (tix/3249) */ | |
/* tix_sz = 64980 */ | |
/* out_ix_x_dim = 57 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%57) */ | |
/* out_ix_y_dim = 57 */ | |
/* out_ix_y_sz = 57 */ | |
/* out_ix_y_nomod = (out_ix/57) */ | |
/* out_ix_y = ((out_ix/57)%%57) */ | |
/* out_ix_chan_dim = 192 */ | |
/* out_ix_chan_sz = 3249 */ | |
/* out_ix_chan_nomod = (out_ix/3249) */ | |
/* out_ix_chan = ((out_ix/3249)%%192) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 623808 */ | |
/* out_ix_img_nomod = (out_ix/623808) */ | |
/* out_ix_img = (out_ix/623808) */ | |
/* out_ix_sz = 12476160 */ | |
// 3x3 / stride-2 max pooling, one output element per work-item (3010560 total).
//
// out is indexed img:chan:y:x with dims 20:192:28:28 (x fastest), in is indexed
// img:chan:y:x with strides 623808:3249:57:1.
//
// The running maximum is seeded from the first in-bounds tap rather than from
// 0.0f, so a window that holds only negative values yields its real maximum
// instead of 0. Taps that fall outside the 57x57 input are skipped. For
// non-negative inputs the result is the same as starting from 0.0f.
CUCL_GLOBAL_KERNEL void pool__num_imgs_20__in_pad_0__in_dim_0_57__in_dim_1_57__conv_has_relu_0__kern_sz_3__stride_2__out_chans_192__avg_pool_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 3010560 ) { return; }
  // start of this output's (img, chan) input plane; same for all taps
  int32_t const in_plane_ix = (out_ix/150528)*623808 + ((out_ix/784)%192)*3249;
  // top-left tap of the window (stride 2, pad 0)
  int32_t const in_y0 = ((out_ix/28)%28)*2 - 0;
  int32_t const in_x0 = (out_ix%28)*2 - 0;
  float out_v = 0.0f; // only kept if no tap at all is inside the input
  int32_t have_v = 0; // becomes 1 once out_v holds a real input value
  for( int32_t kx = 0; kx != 3; ++kx ) {
    for( int32_t ky = 0; ky != 3; ++ky ) {
      int32_t const in_ix_y = in_y0 + ky;
      int32_t const in_ix_x = in_x0 + kx;
      if( in_ix_y >= 0 && in_ix_x >= 0 && in_ix_x < 57 && in_ix_y < 57 ) {
        float const v = in[in_plane_ix + in_ix_y*57 + in_ix_x*1];
        out_v = have_v ? max( out_v, v ) : v;
        have_v = 1;
      }
    }
  }
  out[out_ix] = out_v;
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* in_pad = 0 */ | |
/* in_dim_0 = 57 */ | |
/* in_dim_1 = 57 */ | |
/* conv_has_relu = 0 */ | |
/* kern_sz = 3 */ | |
/* stride = 2 */ | |
/* out_chans = 192 */ | |
/* avg_pool = 0 */ | |
/* rtc_func_name = pool__num_imgs_20__in_pad_0__in_dim_0_57__in_dim_1_57__conv_has_relu_0__kern_sz_3__stride_2__out_chans_192__avg_pool_0 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 192 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%192) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 150528 */ | |
/* out_ix_img_nomod = (out_ix/150528) */ | |
/* out_ix_img = (out_ix/150528) */ | |
/* out_ix_sz = 3010560 */ | |
/* in_ix_x_dim = 57 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%57) */ | |
/* in_ix_y_dim = 57 */ | |
/* in_ix_y_sz = 57 */ | |
/* in_ix_y_nomod = (in_ix/57) */ | |
/* in_ix_y = ((in_ix/57)%%57) */ | |
/* in_ix_chan_dim = 192 */ | |
/* in_ix_chan_sz = 3249 */ | |
/* in_ix_chan_nomod = (in_ix/3249) */ | |
/* in_ix_chan = ((in_ix/3249)%%192) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 623808 */ | |
/* in_ix_img_nomod = (in_ix/623808) */ | |
/* in_ix_img = (in_ix/623808) */ | |
/* in_ix_sz = 12476160 */ | |
/* op = out_v = max( out_v, v ) */ | |
/* op_post = */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_96__write_xposed_0__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
//int32_t const blk_in_ix_sz = 10*8; | |
LOCSHAR_MEM float all_smem[1408]; // note: max(filts+in,out) == max(768+640,960) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 768; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*18432; // index of first out chan | |
int32_t blk_in_ix_base = GRP_ID_1D*15360 + LOC_ID_1D;// index of first input pel to load for this thread | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%12); | |
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/12); | |
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D; | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
// iteratate over filter elements | |
for( int32_t blk_iter = 0; blk_iter != 24; ++blk_iter ) { | |
BARRIER_SYNC; | |
// begin smem_loads | |
filts_smem[(LOC_ID_1D + 120 * 0)] = filts[filts_off+(120*0)]; | |
filts_smem[(LOC_ID_1D + 120 * 1)] = filts[filts_off+(120*1)]; | |
filts_smem[(LOC_ID_1D + 120 * 2)] = filts[filts_off+(120*2)]; | |
filts_smem[(LOC_ID_1D + 120 * 3)] = filts[filts_off+(120*3)]; | |
filts_smem[(LOC_ID_1D + 120 * 4)] = filts[filts_off+(120*4)]; | |
filts_smem[(LOC_ID_1D + 120 * 5)] = filts[filts_off+(120*5)]; | |
if( (LOC_ID_1D + 120 * 6) < 768 ) { filts_smem[(LOC_ID_1D + 120 * 6)] = filts[filts_off+(120*6)];} | |
in_smem[(LOC_ID_1D + 120 * 0)] = in[ blk_in_ix_base + (120*0) ]; | |
in_smem[(LOC_ID_1D + 120 * 1)] = in[ blk_in_ix_base + (120*1) ]; | |
in_smem[(LOC_ID_1D + 120 * 2)] = in[ blk_in_ix_base + (120*2) ]; | |
in_smem[(LOC_ID_1D + 120 * 3)] = in[ blk_in_ix_base + (120*3) ]; | |
in_smem[(LOC_ID_1D + 120 * 4)] = in[ blk_in_ix_base + (120*4) ]; | |
if( (LOC_ID_1D + 120 * 5) < 640) { in_smem[(LOC_ID_1D + 120 * 5)] = in[ blk_in_ix_base + (120*5) ];} | |
// end smem_loads; | |
BARRIER_SYNC; | |
filts_off += 96*8; | |
blk_in_ix_base += 640; | |
// begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*96+0*12]; | |
filts_strip[1] = filts_smem_off[0*96+1*12]; | |
filts_strip[2] = filts_smem_off[0*96+2*12]; | |
filts_strip[3] = filts_smem_off[0*96+3*12]; | |
filts_strip[4] = filts_smem_off[0*96+4*12]; | |
filts_strip[5] = filts_smem_off[0*96+5*12]; | |
filts_strip[6] = filts_smem_off[0*96+6*12]; | |
filts_strip[7] = filts_smem_off[0*96+7*12]; | |
in_strip[0] = in_smem_off[(0*8*10+0)]; | |
in_strip[1] = in_smem_off[(0*8*10+1)]; | |
in_strip[2] = in_smem_off[(0*8*10+2)]; | |
in_strip[3] = in_smem_off[(0*8*10+3)]; | |
in_strip[4] = in_smem_off[(0*8*10+4)]; | |
in_strip[5] = in_smem_off[(0*8*10+5)]; | |
in_strip[6] = in_smem_off[(0*8*10+6)]; | |
in_strip[7] = in_smem_off[(0*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*96+0*12]; | |
filts_strip[1] = filts_smem_off[1*96+1*12]; | |
filts_strip[2] = filts_smem_off[1*96+2*12]; | |
filts_strip[3] = filts_smem_off[1*96+3*12]; | |
filts_strip[4] = filts_smem_off[1*96+4*12]; | |
filts_strip[5] = filts_smem_off[1*96+5*12]; | |
filts_strip[6] = filts_smem_off[1*96+6*12]; | |
filts_strip[7] = filts_smem_off[1*96+7*12]; | |
in_strip[0] = in_smem_off[(1*8*10+0)]; | |
in_strip[1] = in_smem_off[(1*8*10+1)]; | |
in_strip[2] = in_smem_off[(1*8*10+2)]; | |
in_strip[3] = in_smem_off[(1*8*10+3)]; | |
in_strip[4] = in_smem_off[(1*8*10+4)]; | |
in_strip[5] = in_smem_off[(1*8*10+5)]; | |
in_strip[6] = in_smem_off[(1*8*10+6)]; | |
in_strip[7] = in_smem_off[(1*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*96+0*12]; | |
filts_strip[1] = filts_smem_off[2*96+1*12]; | |
filts_strip[2] = filts_smem_off[2*96+2*12]; | |
filts_strip[3] = filts_smem_off[2*96+3*12]; | |
filts_strip[4] = filts_smem_off[2*96+4*12]; | |
filts_strip[5] = filts_smem_off[2*96+5*12]; | |
filts_strip[6] = filts_smem_off[2*96+6*12]; | |
filts_strip[7] = filts_smem_off[2*96+7*12]; | |
in_strip[0] = in_smem_off[(2*8*10+0)]; | |
in_strip[1] = in_smem_off[(2*8*10+1)]; | |
in_strip[2] = in_smem_off[(2*8*10+2)]; | |
in_strip[3] = in_smem_off[(2*8*10+3)]; | |
in_strip[4] = in_smem_off[(2*8*10+4)]; | |
in_strip[5] = in_smem_off[(2*8*10+5)]; | |
in_strip[6] = in_smem_off[(2*8*10+6)]; | |
in_strip[7] = in_smem_off[(2*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*96+0*12]; | |
filts_strip[1] = filts_smem_off[3*96+1*12]; | |
filts_strip[2] = filts_smem_off[3*96+2*12]; | |
filts_strip[3] = filts_smem_off[3*96+3*12]; | |
filts_strip[4] = filts_smem_off[3*96+4*12]; | |
filts_strip[5] = filts_smem_off[3*96+5*12]; | |
filts_strip[6] = filts_smem_off[3*96+6*12]; | |
filts_strip[7] = filts_smem_off[3*96+7*12]; | |
in_strip[0] = in_smem_off[(3*8*10+0)]; | |
in_strip[1] = in_smem_off[(3*8*10+1)]; | |
in_strip[2] = in_smem_off[(3*8*10+2)]; | |
in_strip[3] = in_smem_off[(3*8*10+3)]; | |
in_strip[4] = in_smem_off[(3*8*10+4)]; | |
in_strip[5] = in_smem_off[(3*8*10+5)]; | |
in_strip[6] = in_smem_off[(3*8*10+6)]; | |
in_strip[7] = in_smem_off[(3*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*96+0*12]; | |
filts_strip[1] = filts_smem_off[4*96+1*12]; | |
filts_strip[2] = filts_smem_off[4*96+2*12]; | |
filts_strip[3] = filts_smem_off[4*96+3*12]; | |
filts_strip[4] = filts_smem_off[4*96+4*12]; | |
filts_strip[5] = filts_smem_off[4*96+5*12]; | |
filts_strip[6] = filts_smem_off[4*96+6*12]; | |
filts_strip[7] = filts_smem_off[4*96+7*12]; | |
in_strip[0] = in_smem_off[(4*8*10+0)]; | |
in_strip[1] = in_smem_off[(4*8*10+1)]; | |
in_strip[2] = in_smem_off[(4*8*10+2)]; | |
in_strip[3] = in_smem_off[(4*8*10+3)]; | |
in_strip[4] = in_smem_off[(4*8*10+4)]; | |
in_strip[5] = in_smem_off[(4*8*10+5)]; | |
in_strip[6] = in_smem_off[(4*8*10+6)]; | |
in_strip[7] = in_smem_off[(4*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*96+0*12]; | |
filts_strip[1] = filts_smem_off[5*96+1*12]; | |
filts_strip[2] = filts_smem_off[5*96+2*12]; | |
filts_strip[3] = filts_smem_off[5*96+3*12]; | |
filts_strip[4] = filts_smem_off[5*96+4*12]; | |
filts_strip[5] = filts_smem_off[5*96+5*12]; | |
filts_strip[6] = filts_smem_off[5*96+6*12]; | |
filts_strip[7] = filts_smem_off[5*96+7*12]; | |
in_strip[0] = in_smem_off[(5*8*10+0)]; | |
in_strip[1] = in_smem_off[(5*8*10+1)]; | |
in_strip[2] = in_smem_off[(5*8*10+2)]; | |
in_strip[3] = in_smem_off[(5*8*10+3)]; | |
in_strip[4] = in_smem_off[(5*8*10+4)]; | |
in_strip[5] = in_smem_off[(5*8*10+5)]; | |
in_strip[6] = in_smem_off[(5*8*10+6)]; | |
in_strip[7] = in_smem_off[(5*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*96+0*12]; | |
filts_strip[1] = filts_smem_off[6*96+1*12]; | |
filts_strip[2] = filts_smem_off[6*96+2*12]; | |
filts_strip[3] = filts_smem_off[6*96+3*12]; | |
filts_strip[4] = filts_smem_off[6*96+4*12]; | |
filts_strip[5] = filts_smem_off[6*96+5*12]; | |
filts_strip[6] = filts_smem_off[6*96+6*12]; | |
filts_strip[7] = filts_smem_off[6*96+7*12]; | |
in_strip[0] = in_smem_off[(6*8*10+0)]; | |
in_strip[1] = in_smem_off[(6*8*10+1)]; | |
in_strip[2] = in_smem_off[(6*8*10+2)]; | |
in_strip[3] = in_smem_off[(6*8*10+3)]; | |
in_strip[4] = in_smem_off[(6*8*10+4)]; | |
in_strip[5] = in_smem_off[(6*8*10+5)]; | |
in_strip[6] = in_smem_off[(6*8*10+6)]; | |
in_strip[7] = in_smem_off[(6*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*96+0*12]; | |
filts_strip[1] = filts_smem_off[7*96+1*12]; | |
filts_strip[2] = filts_smem_off[7*96+2*12]; | |
filts_strip[3] = filts_smem_off[7*96+3*12]; | |
filts_strip[4] = filts_smem_off[7*96+4*12]; | |
filts_strip[5] = filts_smem_off[7*96+5*12]; | |
filts_strip[6] = filts_smem_off[7*96+6*12]; | |
filts_strip[7] = filts_smem_off[7*96+7*12]; | |
in_strip[0] = in_smem_off[(7*8*10+0)]; | |
in_strip[1] = in_smem_off[(7*8*10+1)]; | |
in_strip[2] = in_smem_off[(7*8*10+2)]; | |
in_strip[3] = in_smem_off[(7*8*10+3)]; | |
in_strip[4] = in_smem_off[(7*8*10+4)]; | |
in_strip[5] = in_smem_off[(7*8*10+5)]; | |
in_strip[6] = in_smem_off[(7*8*10+6)]; | |
in_strip[7] = in_smem_off[(7*8*10+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
; | |
} | |
// load per-block biases into smem | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+120*i; | |
if( t_smem_bias_ix < 96 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*96; | |
int32_t const load_reg = t_smem_bias_ix / 12; | |
int32_t const load_tile = t_smem_bias_ix % 12; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 96 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*12]; | |
filts_strip[1] = filts_smem_off[1*12]; | |
filts_strip[2] = filts_smem_off[2*12]; | |
filts_strip[3] = filts_smem_off[3*12]; | |
filts_strip[4] = filts_smem_off[4*12]; | |
filts_strip[5] = filts_smem_off[5*12]; | |
filts_strip[6] = filts_smem_off[6*12]; | |
filts_strip[7] = filts_smem_off[7*12]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { | |
GASQ float * const out_off = out + LOC_ID_1D; | |
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[120] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[240] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[360] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[480] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[600] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[720] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[840] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[960] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1080] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1200] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1320] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1440] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1560] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1680] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1800] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[1920] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2040] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2160] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2280] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2400] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2520] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2640] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2760] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[2880] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3000] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3120] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3240] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3360] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3480] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3600] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3720] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[3840] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[3960] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4080] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4200] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4320] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4440] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4560] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4680] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[4800] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[4920] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5040] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5160] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5280] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5400] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5520] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[5640] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[5760] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[5880] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6000] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6120] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6240] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6360] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6480] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[6600] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[6720] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[6840] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[6960] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7080] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7200] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7320] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7440] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[7560] = max(0.0f,out_tile[63]+filts_strip[7]); | |
; | |
return; | |
} | |
// add bias to each elem of out_tile[] and store the results to out[] | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+0)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+0)%784)*1 ; // cache out patch ixs | |
tpix[1] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+1)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+1)%784)*1 ; // cache out patch ixs | |
tpix[2] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+2)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+2)%784)*1 ; // cache out patch ixs | |
tpix[3] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+3)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+3)%784)*1 ; // cache out patch ixs | |
tpix[4] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+4)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+4)%784)*1 ; // cache out patch ixs | |
tpix[5] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+5)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+5)%784)*1 ; // cache out patch ixs | |
tpix[6] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+6)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+6)%784)*1 ; // cache out patch ixs | |
tpix[7] = ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+7)/784)*75264 + ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+7)%784)*1 ; // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+0)*784; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+1)*784; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+2)*784; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+3)*784; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+4)*784; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+5)*784; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+6)*784; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%12)+(GRP_ID_1D%1)*12)*8)+7)*784; // cache out chan ixs | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+0)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+1)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+2)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+3)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+4)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+5)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+6)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( ((GRP_ID_1D*80 + (LOC_ID_1D/12)*8+7)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (96*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (96*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (96*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (96*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (96*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (96*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (96*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (96*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 96 */ | |
/* write_xposed = 0 */ | |
/* in_chans = 192 */ | |
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_96__write_xposed_0__in_chans_192 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 96 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%96) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 75264 */ | |
/* out_ix_img_nomod = (out_ix/75264) */ | |
/* out_ix_img = (out_ix/75264) */ | |
/* out_ix_sz = 1505280 */ | |
/* tpb = 120 */ | |
/* in_chan_tile = 8 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 12 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%12) */ | |
/* LOC_ID_1D_pels_tile_dim = 10 */ | |
/* LOC_ID_1D_pels_tile_sz = 12 */ | |
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/12) */ | |
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/12) */ | |
/* LOC_ID_1D_sz = 120 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_pels_blk_dim = 196 */ | |
/* GRP_ID_1D_pels_blk_sz = 1 */ | |
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_pels_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 196 */ | |
/* in_ix_blk_pel_dim = 80 */ | |
/* in_ix_blk_pel_sz = 1 */ | |
/* in_ix_blk_pel_nomod = in_ix */ | |
/* in_ix_blk_pel = (in_ix%%80) */ | |
/* in_ix_blk_iter_chan_dim = 8 */ | |
/* in_ix_blk_iter_chan_sz = 80 */ | |
/* in_ix_blk_iter_chan_nomod = (in_ix/80) */ | |
/* in_ix_blk_iter_chan = ((in_ix/80)%%8) */ | |
/* in_ix_blk_iter_dim = 24 */ | |
/* in_ix_blk_iter_sz = 640 */ | |
/* in_ix_blk_iter_nomod = (in_ix/640) */ | |
/* in_ix_blk_iter = ((in_ix/640)%%24) */ | |
/* in_ix_blk_dim = 196 */ | |
/* in_ix_blk_sz = 15360 */ | |
/* in_ix_blk_nomod = (in_ix/15360) */ | |
/* in_ix_blk = (in_ix/15360) */ | |
/* in_ix_sz = 3010560 */ | |
/* blk_filt_ix_sz = 96 */ | |
/* filts_smem_sz = 768 */ | |
/* in_smem_sz = 640 */ | |
/* out_smem_sz = 960 */ | |
/* all_smem_sz = 1408 */ | |
/* filts_xp_ix_out_chan_tile_dim = 12 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%12) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 12 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/12) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/12)%%8) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 96 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/96) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/96)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 18432 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/18432) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/18432) */ | |
/* filts_xp_ix_sz = 18432 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* smem_loads = // begin smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 4)] = filts[filts_off+(%(tpb)*4)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 5)] = filts[filts_off+(%(tpb)*5)]; | |
if( (LOC_ID_1D + %(tpb) * 6) < %(filts_smem_sz) ) { filts_smem[(LOC_ID_1D + %(tpb) * 6)] = filts[filts_off+(%(tpb)*6)];} | |
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ]; | |
if( (LOC_ID_1D + %(tpb) * 5) < %(in_ix_blk_iter_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ];} | |
// end smem_loads */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* t_smem_ld_pel_pel_dim = 80 */ | |
/* t_smem_ld_pel_pel_sz = 1 */ | |
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */ | |
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%80) */ | |
/* t_smem_ld_pel_chan_dim = 8 */ | |
/* t_smem_ld_pel_chan_sz = 80 */ | |
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/80) */ | |
/* t_smem_ld_pel_chan = (t_smem_ld_pel/80) */ | |
/* t_smem_ld_pel_sz = 640 */ | |
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */ | |
/* out_pel_0_pel_dim = 784 */ | |
/* out_pel_0_pel_sz = 1 */ | |
/* out_pel_0_pel_nomod = %(out_pel_0) */ | |
/* out_pel_0_pel = (%(out_pel_0)%%784) */ | |
/* out_pel_0_img_dim = 20 */ | |
/* out_pel_0_img_sz = 784 */ | |
/* out_pel_0_img_nomod = (%(out_pel_0)/784) */ | |
/* out_pel_0_img = (%(out_pel_0)/784) */ | |
/* out_pel_0_sz = 15680 */ | |
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */ | |
/* out_pel_1_pel_dim = 784 */ | |
/* out_pel_1_pel_sz = 1 */ | |
/* out_pel_1_pel_nomod = %(out_pel_1) */ | |
/* out_pel_1_pel = (%(out_pel_1)%%784) */ | |
/* out_pel_1_img_dim = 20 */ | |
/* out_pel_1_img_sz = 784 */ | |
/* out_pel_1_img_nomod = (%(out_pel_1)/784) */ | |
/* out_pel_1_img = (%(out_pel_1)/784) */ | |
/* out_pel_1_sz = 15680 */ | |
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */ | |
/* out_pel_2_pel_dim = 784 */ | |
/* out_pel_2_pel_sz = 1 */ | |
/* out_pel_2_pel_nomod = %(out_pel_2) */ | |
/* out_pel_2_pel = (%(out_pel_2)%%784) */ | |
/* out_pel_2_img_dim = 20 */ | |
/* out_pel_2_img_sz = 784 */ | |
/* out_pel_2_img_nomod = (%(out_pel_2)/784) */ | |
/* out_pel_2_img = (%(out_pel_2)/784) */ | |
/* out_pel_2_sz = 15680 */ | |
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */ | |
/* out_pel_3_pel_dim = 784 */ | |
/* out_pel_3_pel_sz = 1 */ | |
/* out_pel_3_pel_nomod = %(out_pel_3) */ | |
/* out_pel_3_pel = (%(out_pel_3)%%784) */ | |
/* out_pel_3_img_dim = 20 */ | |
/* out_pel_3_img_sz = 784 */ | |
/* out_pel_3_img_nomod = (%(out_pel_3)/784) */ | |
/* out_pel_3_img = (%(out_pel_3)/784) */ | |
/* out_pel_3_sz = 15680 */ | |
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */ | |
/* out_pel_4_pel_dim = 784 */ | |
/* out_pel_4_pel_sz = 1 */ | |
/* out_pel_4_pel_nomod = %(out_pel_4) */ | |
/* out_pel_4_pel = (%(out_pel_4)%%784) */ | |
/* out_pel_4_img_dim = 20 */ | |
/* out_pel_4_img_sz = 784 */ | |
/* out_pel_4_img_nomod = (%(out_pel_4)/784) */ | |
/* out_pel_4_img = (%(out_pel_4)/784) */ | |
/* out_pel_4_sz = 15680 */ | |
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */ | |
/* out_pel_5_pel_dim = 784 */ | |
/* out_pel_5_pel_sz = 1 */ | |
/* out_pel_5_pel_nomod = %(out_pel_5) */ | |
/* out_pel_5_pel = (%(out_pel_5)%%784) */ | |
/* out_pel_5_img_dim = 20 */ | |
/* out_pel_5_img_sz = 784 */ | |
/* out_pel_5_img_nomod = (%(out_pel_5)/784) */ | |
/* out_pel_5_img = (%(out_pel_5)/784) */ | |
/* out_pel_5_sz = 15680 */ | |
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */ | |
/* out_pel_6_pel_dim = 784 */ | |
/* out_pel_6_pel_sz = 1 */ | |
/* out_pel_6_pel_nomod = %(out_pel_6) */ | |
/* out_pel_6_pel = (%(out_pel_6)%%784) */ | |
/* out_pel_6_img_dim = 20 */ | |
/* out_pel_6_img_sz = 784 */ | |
/* out_pel_6_img_nomod = (%(out_pel_6)/784) */ | |
/* out_pel_6_img = (%(out_pel_6)/784) */ | |
/* out_pel_6_sz = 15680 */ | |
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */ | |
/* out_pel_7_pel_dim = 784 */ | |
/* out_pel_7_pel_sz = 1 */ | |
/* out_pel_7_pel_nomod = %(out_pel_7) */ | |
/* out_pel_7_pel = (%(out_pel_7)%%784) */ | |
/* out_pel_7_img_dim = 20 */ | |
/* out_pel_7_img_sz = 784 */ | |
/* out_pel_7_img_nomod = (%(out_pel_7)/784) */ | |
/* out_pel_7_img = (%(out_pel_7)/784) */ | |
/* out_pel_7_sz = 15680 */ | |
/* t_tile_stores = // begin t_tile_stores | |
int32_t tpix[%(t_tile_sz)]; | |
int32_t tcix[%(t_tile_sz)]; | |
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs | |
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[120] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[240] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[360] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[480] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[600] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[720] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[840] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[960] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1080] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1200] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1320] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1440] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1560] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1680] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1800] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[1920] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2040] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2160] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2280] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2400] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2520] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2640] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2760] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[2880] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3000] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3120] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3240] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3360] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3480] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3600] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3720] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[3840] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[3960] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4080] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4200] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4320] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4440] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4560] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4680] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[4800] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[4920] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5040] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5160] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5280] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5400] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5520] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[5640] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[5760] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[5880] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6000] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6120] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6240] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6360] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6480] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[6600] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[6720] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[6840] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[6960] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7080] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7200] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7320] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7440] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[7560] = max(0.0f,out_tile[63]+filts_strip[7]); | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
*/ | |
// Re-tiles the input from img:chan:y:x order (20 x 192 x 28 x 28) into the blocked order
// blk:blk_iter:blk_iter_chan:blk_pel (196 x 24 x 8 x 80). Each work-item produces one element of out.
//   in  : source buffer, read at img*150528 + chan*784 + y*28 + x
//   out : destination buffer of 3010560 (== 196*15360) elements, indexed directly by the global id
// Output elements whose channel or image index falls outside the input are written as 0.0f.
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_10__bix_pels_blk_sz_196( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  // Work-items past the end of out (e.g. when the launch size is rounded up) must not store anything;
  // this mirrors the guard used by the xpose_filts kernel.
  if( out_ix >= 3010560 ) { return; }
  // channel = blk_iter*8 + blk_iter_chan, where blk_iter = (out_ix/640)%24 and blk_iter_chan = (out_ix/80)%8
  int32_t const chan_ix = ((out_ix/640)%24)*8 + ((out_ix/80)%8);
  // pel (img:y:x merged) = blk*80 + blk_pel, where blk = out_ix/15360 and blk_pel = out_ix%80
  int32_t const pel_ix = (out_ix/15360)*80 + (out_ix%80);
  float v = 0.0f; // value used for padding channels / pels beyond the last image
  if( ( chan_ix < 192 ) && ( (pel_ix/784) < 20 ) ) {
    v = in[ (pel_ix/784)*150528 +   // img
            chan_ix*784 +           // chan
            ((pel_ix/28)%28)*28 +   // y
            (pel_ix%28)*1 ];        // x
  }
  out[out_ix] = v;
}
/* | |
in_pels = num_img * in.sz.dims_prod() | |
num_in_blks = u32_ceil_div( in_pels, block_chan_pels ) | |
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged | |
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12 | |
pad_chan = block_iters * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?] | |
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64 | |
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512 | |
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512 floats = 2KB (per-iter cache) is fine. | |
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel | |
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?) | |
*/ | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_chan_tile = 8 */ | |
/* pad_in_chans = 192 */ | |
/* in_chans = 192 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* tix_pels_tile_sz = 10 */ | |
/* bix_pels_blk_sz = 196 */ | |
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_10__bix_pels_blk_sz_196 */ | |
/* out_ix_blk_pel_dim = 80 */ | |
/* out_ix_blk_pel_sz = 1 */ | |
/* out_ix_blk_pel_nomod = out_ix */ | |
/* out_ix_blk_pel = (out_ix%%80) */ | |
/* out_ix_blk_iter_chan_dim = 8 */ | |
/* out_ix_blk_iter_chan_sz = 80 */ | |
/* out_ix_blk_iter_chan_nomod = (out_ix/80) */ | |
/* out_ix_blk_iter_chan = ((out_ix/80)%%8) */ | |
/* out_ix_blk_iter_dim = 24 */ | |
/* out_ix_blk_iter_sz = 640 */ | |
/* out_ix_blk_iter_nomod = (out_ix/640) */ | |
/* out_ix_blk_iter = ((out_ix/640)%%24) */ | |
/* out_ix_blk_dim = 196 */ | |
/* out_ix_blk_sz = 15360 */ | |
/* out_ix_blk_nomod = (out_ix/15360) */ | |
/* out_ix_blk = (out_ix/15360) */ | |
/* out_ix_sz = 3010560 */ | |
/* pel_ix_x_dim = 28 */ | |
/* pel_ix_x_sz = 1 */ | |
/* pel_ix_x_nomod = pel_ix */ | |
/* pel_ix_x = (pel_ix%%28) */ | |
/* pel_ix_y_dim = 28 */ | |
/* pel_ix_y_sz = 28 */ | |
/* pel_ix_y_nomod = (pel_ix/28) */ | |
/* pel_ix_y = ((pel_ix/28)%%28) */ | |
/* pel_ix_img_dim = 20 */ | |
/* pel_ix_img_sz = 784 */ | |
/* pel_ix_img_nomod = (pel_ix/784) */ | |
/* pel_ix_img = (pel_ix/784) */ | |
/* pel_ix_sz = 15680 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 192 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%192) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 150528 */ | |
/* in_ix_img_nomod = (in_ix/150528) */ | |
/* in_ix_img = (in_ix/150528) */ | |
/* in_ix_sz = 3010560 */ | |
// Copies the 96*192 == 18432 filter values from out_chan:in_chan:y:x order into the transposed
// order out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile. Each work-item moves one value;
// work-items with a global id of 18432 or more do nothing.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_96__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const src_ix = GLOB_ID_1D;
  if( !( src_ix < 18432 ) ) { return; }

  // split the source index into its per-dimension parts
  int32_t const out_chan = src_ix / 192;
  int32_t const in_chan = src_ix % 192;
  int32_t const kern_y = src_ix % 1; // kysz == 1, so this is always 0
  int32_t const kern_x = src_ix % 1; // kxsz == 1, so this is always 0

  // split the output channel into (blk, tile, reg) and rebuild the destination index
  int32_t const oc_blk = out_chan / 96;
  int32_t const oc_tile = (out_chan / 8) % 12;
  int32_t const oc_reg = out_chan % 8;
  int32_t const dst_ix = oc_blk*18432 + oc_reg*12 + oc_tile*1 + in_chan*96 + kern_y*96 + kern_x*96;

#if 1
  float const src_val = in[src_ix];
#else
  // debug alternative: write an index-derived pattern for in_chan 0 and zero elsewhere
  float const src_val = ( in_chan == 0 ) ? (float)( kern_y*100 + kern_x ) : 0.0f;
#endif
  out[dst_ix] = src_val;
}
// -- template substitution table used: -- | |
/* out_chans = 96 */ | |
/* in_chans = 192 */ | |
/* kysz = 1 */ | |
/* kxsz = 1 */ | |
/* rtc_func_name = xpose_filts__out_chans_96__in_chans_192__kysz_1__kxsz_1 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 1 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%1) */ | |
/* filts_ix_y_dim = 1 */ | |
/* filts_ix_y_sz = 1 */ | |
/* filts_ix_y_nomod = filts_ix */ | |
/* filts_ix_y = (filts_ix%%1) */ | |
/* filts_ix_in_chan_dim = 192 */ | |
/* filts_ix_in_chan_sz = 1 */ | |
/* filts_ix_in_chan_nomod = filts_ix */ | |
/* filts_ix_in_chan = (filts_ix%%192) */ | |
/* filts_ix_out_chan_dim = 96 */ | |
/* filts_ix_out_chan_sz = 192 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/192) */ | |
/* filts_ix_out_chan = (filts_ix/192) */ | |
/* filts_ix_sz = 18432 */ | |
/* filts_xp_ix_out_chan_tile_dim = 12 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%12) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 12 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/12) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/12)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 96 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/96) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/96)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 96 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/96) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/96)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 96 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/96) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/96)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 18432 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/18432) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/18432) */ | |
/* filts_xp_ix_sz = 18432 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 12 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%12) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 96 */ | |
/* fioc_out_chan_blk_nomod = (fioc/96) */ | |
/* fioc_out_chan_blk = (fioc/96) */ | |
/* fioc_sz = 96 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
// Generated tiled-convolution kernel, specialized (per its name and the substitution table that
// follows it) for: 20 images, 28x28 input, 96 input channels, 128 output channels, 3x3 filter,
// stride 1, input padding 1, with ReLU applied on store.
//
// Work decomposition as used by the code below:
//  - GRP_ID_1D/4 selects a group of 8 consecutive output lines; GRP_ID_1D%4 selects an 8-pel-wide x strip.
//  - LOC_ID_1D/16 selects the output line within that group; LOC_ID_1D%16 selects a tile of 8 output channels.
//  - each work-item accumulates an 8 (x) by 8 (output channel) tile in out_tile[x*8 + chan].
// The substitution table lists tpb = 128 and GRP_ID_1D_sz = 280 for the launch.
//
// Parameters:
//  filts  - filter weights; 384 floats (3 kx * 128 out chans) are read per (in_chan,ky) step.
//  biases - one bias per output channel, indexed by output channel.
//  in     - input, already arranged so each work-group reads 120 contiguous floats (12 lines of 10 pels)
//           per input channel, starting at GRP_ID_1D*11520.
//  out    - output, written at img*100352 + chan*784 + y*28 + x.
//  flags  - 2: return after accumulation; 1: return after the bias loads; any other value: run fully.
//           NOTE(review): presumably a profiling/debug aid -- confirm with the caller.
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_128__in_chans_96( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(384+120,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 384; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[10]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t blk_in_ix_base = GRP_ID_1D*11520 + LOC_ID_1D;// index of first input pel to load for this thread | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*110592; // index of first out chan | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16); | |
int32_t out_line = (GRP_ID_1D/4)*8; // first out_line of block | |
int32_t const blk_fli = (out_line/28); // image of first out_line of block | |
out_line += (LOC_ID_1D/16); // adjust to out_line of this thread | |
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img) | |
int32_t const img_off_lines = ((out_line/28) - blk_fli)*(3-1); | |
// first input line touched by this thread's output line; -1 for the first line of an image (top padding)
int32_t const in_y = (out_line%28)*1 - 1; | |
for( int32_t in_chan = 0; in_chan != 96; ++in_chan ) { | |
// all work-items must be done reading in_smem/filts_smem from the previous step before they are overwritten
BARRIER_SYNC; | |
// begin in_smem_loads | |
if( (LOC_ID_1D + 128 * 0) < 120) { in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];} | |
blk_in_ix_base += 120; | |
// end in_smem_loads; | |
for( int32_t ky = 0; ky != 3; ++ky ) { | |
if( ky != 0 ) { BARRIER_SYNC; } | |
// begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)]; | |
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)]; | |
filts_off += 384; | |
// end filt_smem_loads; | |
BARRIER_SYNC; | |
// note: every work-item has already passed this step's barriers, so the 'continue's below only skip math.
if( (out_line/28) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid) | |
// in_y+ky lies in [-1,28] and the valid input lines are 0..27, so the bottom padding line is 28.
// the previous test '>28' could never be true; '>=28' skips the bottom padding line like the top one.
// NOTE(review): this kernel is generated; the same comparison presumably needs fixing in the template.
if( ((in_y+ky) < 0) || ((in_y+ky)>=28) ) { continue; } // optimization: skip known-to-be-padding input lines | |
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/16)*1+ky+img_off_lines)*10; | |
// begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
// kx = 0: filter column 0 pairs output x with in_strip[x+0]
filts_strip[0] = filts_smem_off[0*128+0*16]; | |
filts_strip[1] = filts_smem_off[0*128+1*16]; | |
filts_strip[2] = filts_smem_off[0*128+2*16]; | |
filts_strip[3] = filts_smem_off[0*128+3*16]; | |
filts_strip[4] = filts_smem_off[0*128+4*16]; | |
filts_strip[5] = filts_smem_off[0*128+5*16]; | |
filts_strip[6] = filts_smem_off[0*128+6*16]; | |
filts_strip[7] = filts_smem_off[0*128+7*16]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
// kx = 1: filter column 1 pairs output x with in_strip[x+1]
filts_strip[0] = filts_smem_off[1*128+0*16]; | |
filts_strip[1] = filts_smem_off[1*128+1*16]; | |
filts_strip[2] = filts_smem_off[1*128+2*16]; | |
filts_strip[3] = filts_smem_off[1*128+3*16]; | |
filts_strip[4] = filts_smem_off[1*128+4*16]; | |
filts_strip[5] = filts_smem_off[1*128+5*16]; | |
filts_strip[6] = filts_smem_off[1*128+6*16]; | |
filts_strip[7] = filts_smem_off[1*128+7*16]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
// kx = 2: filter column 2 pairs output x with in_strip[x+2]
filts_strip[0] = filts_smem_off[2*128+0*16]; | |
filts_strip[1] = filts_smem_off[2*128+1*16]; | |
filts_strip[2] = filts_smem_off[2*128+2*16]; | |
filts_strip[3] = filts_smem_off[2*128+3*16]; | |
filts_strip[4] = filts_smem_off[2*128+4*16]; | |
filts_strip[5] = filts_smem_off[2*128+5*16]; | |
filts_strip[6] = filts_smem_off[2*128+6*16]; | |
filts_strip[7] = filts_smem_off[2*128+7*16]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
; | |
} | |
} | |
// flags is a kernel argument, so this early return is taken by all work-items or by none
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
// stage the biases in filts_smem so that filts_smem[reg*16 + tile] holds the bias of out chan tile*8 + reg,
// i.e. the same [reg][tile] arrangement that filts_smem_off indexing used for the filter values
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 128 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*128; | |
int32_t const load_reg = t_smem_bias_ix / 16; | |
int32_t const load_tile = t_smem_bias_ix % 16; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 128 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*16]; | |
filts_strip[1] = filts_smem_off[1*16]; | |
filts_strip[2] = filts_smem_off[2*16]; | |
filts_strip[3] = filts_smem_off[3*16]; | |
filts_strip[4] = filts_smem_off[4*16]; | |
filts_strip[5] = filts_smem_off[5*16]; | |
filts_strip[6] = filts_smem_off[6*16]; | |
filts_strip[7] = filts_smem_off[7*16]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { return; } | |
// begin t_tile_stores | |
// no barriers follow, so per-work-item early returns are safe from here on
if( (out_line/28) >= 20 ) { return; } | |
int32_t out_x = (GRP_ID_1D%4)*8; | |
int32_t out_chan = ((GRP_ID_1D%1)*16 + (LOC_ID_1D%16))*8; | |
GASQ float * out_off = out + (out_line/28)*100352 + out_chan*784 + (out_line%28)*28 + out_x*1 ; | |
// each store below adds the channel bias (held in filts_strip) and applies ReLU via max(0.0f, ...)
if( (out_x + 0) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 128 ) { out_off[ 0*784 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < 128 ) { out_off[ 1*784 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < 128 ) { out_off[ 2*784 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < 128 ) { out_off[ 3*784 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < 128 ) { out_off[ 4*784 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < 128 ) { out_off[ 5*784 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < 128 ) { out_off[ 6*784 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < 128 ) { out_off[ 7*784 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* kern_sz = 3 */ | |
/* stride = 1 */ | |
/* in_pad = 1 */ | |
/* t_tile_sz = 8 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 128 */ | |
/* in_chans = 96 */ | |
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_128__in_chans_96 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 128 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%128) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 100352 */ | |
/* out_ix_img_nomod = (out_ix/100352) */ | |
/* out_ix_img = (out_ix/100352) */ | |
/* out_ix_sz = 2007040 */ | |
/* tpb = 128 */ | |
/* out_line_y_dim = 28 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%28) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 28 */ | |
/* out_line_img_nomod = (out_line/28) */ | |
/* out_line_img = (out_line/28) */ | |
/* out_line_sz = 560 */ | |
/* in_ix_blk_x_dim = 10 */ | |
/* in_ix_blk_x_sz = 1 */ | |
/* in_ix_blk_x_nomod = in_ix */ | |
/* in_ix_blk_x = (in_ix%%10) */ | |
/* in_ix_blk_y_dim = 12 */ | |
/* in_ix_blk_y_sz = 10 */ | |
/* in_ix_blk_y_nomod = (in_ix/10) */ | |
/* in_ix_blk_y = ((in_ix/10)%%12) */ | |
/* in_ix_blk_in_chan_dim = 96 */ | |
/* in_ix_blk_in_chan_sz = 120 */ | |
/* in_ix_blk_in_chan_nomod = (in_ix/120) */ | |
/* in_ix_blk_in_chan = ((in_ix/120)%%96) */ | |
/* in_ix_blk_bx_dim = 4 */ | |
/* in_ix_blk_bx_sz = 11520 */ | |
/* in_ix_blk_bx_nomod = (in_ix/11520) */ | |
/* in_ix_blk_bx = ((in_ix/11520)%%4) */ | |
/* in_ix_blk_bline_dim = 70 */ | |
/* in_ix_blk_bline_sz = 46080 */ | |
/* in_ix_blk_bline_nomod = (in_ix/46080) */ | |
/* in_ix_blk_bline = (in_ix/46080) */ | |
/* in_ix_sz = 3225600 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 16 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */ | |
/* LOC_ID_1D_blk_y_dim = 8 */ | |
/* LOC_ID_1D_blk_y_sz = 16 */ | |
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_blk_y = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_blk_bx_dim = 4 */ | |
/* GRP_ID_1D_blk_bx_sz = 1 */ | |
/* GRP_ID_1D_blk_bx_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_blk_bx = (GRP_ID_1D%%4) */ | |
/* GRP_ID_1D_blk_bline_dim = 70 */ | |
/* GRP_ID_1D_blk_bline_sz = 4 */ | |
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/4) */ | |
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/4) */ | |
/* GRP_ID_1D_sz = 280 */ | |
/* blk_filt_ix_sz = 128 */ | |
/* filts_smem_sz = 384 */ | |
/* in_smem_sz = 120 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1024 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 3 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */ | |
/* filts_xp_ix_y_dim = 3 */ | |
/* filts_xp_ix_y_sz = 384 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */ | |
/* filts_xp_ix_in_chan_dim = 96 */ | |
/* filts_xp_ix_in_chan_sz = 1152 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%96) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 110592 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/110592) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/110592) */ | |
/* filts_xp_ix_sz = 110592 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* filt_smem_loads = // begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_off += %(filts_xp_ix_y_sz); | |
// end filt_smem_loads */ | |
/* in_smem_loads = // begin in_smem_loads | |
if( (LOC_ID_1D + %(tpb) * 0) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];} | |
blk_in_ix_base += %(in_ix_blk_in_chan_sz); | |
// end in_smem_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; } | |
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz); | |
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz); | |
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ; | |
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
// Copies `in` (indexed as img*75264 + chan*784 + y*28 + x) into the blocked layout of `out`,
// one output element per work-item. The flat output index decomposes (fastest to slowest) as
//   blk_x (10) : blk_y (12) : in_chan (96) : blk_bx (4) : blk_bline,
// matching the out_ix_blk_* entries of the substitution table that follows this kernel.
// Elements whose source coordinate lies outside the 28x28 image, or whose image index
// is >= 20, are written as 0.0f.
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_96__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280( GASQ float const * const in, GASQ float * const out ) {
  int32_t const gid = GLOB_ID_1D;
  if( gid < 3225600 ) { // work-items at or past the end of out have nothing to write
    // split the flat output index into its per-dimension components
    int32_t const blk_x = gid % 10;
    int32_t const blk_y = (gid / 10) % 12;
    int32_t const chan = (gid / 120) % 96;
    int32_t const blk_bx = (gid / 11520) % 4;
    int32_t const first_out_line = (gid / 46080) * 8; // each bline covers 8 output lines

    // input lines per image, counted as (28 - 1)*1 + 3 == 30
    int32_t const lines_per_img = (28 - 1)*1 + 3;
    // input line relative to the start of the image that holds first_out_line; it may
    // run past that image's lines_per_img lines, in which case it spills into the next image
    int32_t const line = blk_y + (first_out_line % 28)*1;
    int32_t const img = (first_out_line / 28) + (line / lines_per_img);
    // the -1 shifts from padded coordinates to image coordinates
    int32_t const y = (line % lines_per_img) - 1;
    int32_t const x = blk_bx*8*1 + blk_x - 1;

    int const in_bounds = ( x >= 0 ) && ( y >= 0 ) && ( x < 28 ) && ( y < 28 ) && ( img < 20 );
    // the load is only evaluated when the coordinate is in bounds
    out[gid] = in_bounds ? in[ img*75264 + chan*784 + y*28 + x*1 ] : 0.0f;
  }
}
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* stride = 1 */ | |
/* kern_sz = 3 */ | |
/* in_pad = 1 */ | |
/* in_chans = 96 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* tix_pels_tile_sz = 8 */ | |
/* t_tile_sz = 8 */ | |
/* bix_pels_blk_sz = 280 */ | |
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_96__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280 */ | |
/* out_ix_blk_x_dim = 10 */ | |
/* out_ix_blk_x_sz = 1 */ | |
/* out_ix_blk_x_nomod = out_ix */ | |
/* out_ix_blk_x = (out_ix%%10) */ | |
/* out_ix_blk_y_dim = 12 */ | |
/* out_ix_blk_y_sz = 10 */ | |
/* out_ix_blk_y_nomod = (out_ix/10) */ | |
/* out_ix_blk_y = ((out_ix/10)%%12) */ | |
/* out_ix_blk_in_chan_dim = 96 */ | |
/* out_ix_blk_in_chan_sz = 120 */ | |
/* out_ix_blk_in_chan_nomod = (out_ix/120) */ | |
/* out_ix_blk_in_chan = ((out_ix/120)%%96) */ | |
/* out_ix_blk_bx_dim = 4 */ | |
/* out_ix_blk_bx_sz = 11520 */ | |
/* out_ix_blk_bx_nomod = (out_ix/11520) */ | |
/* out_ix_blk_bx = ((out_ix/11520)%%4) */ | |
/* out_ix_blk_bline_dim = 70 */ | |
/* out_ix_blk_bline_sz = 46080 */ | |
/* out_ix_blk_bline_nomod = (out_ix/46080) */ | |
/* out_ix_blk_bline = (out_ix/46080) */ | |
/* out_ix_sz = 3225600 */ | |
/* out_line_y_dim = 28 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%28) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 28 */ | |
/* out_line_img_nomod = (out_line/28) */ | |
/* out_line_img = (out_line/28) */ | |
/* out_line_sz = 560 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 96 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%96) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 75264 */ | |
/* in_ix_img_nomod = (in_ix/75264) */ | |
/* in_ix_img = (in_ix/75264) */ | |
/* in_ix_sz = 1505280 */ | |
// Reorders the filter weights, one element per work-item: reads in[] in
// out_chan:in_chan:y:x order and writes out[] in
// out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile order (strides as listed in the
// filts_xp_ix_* entries of the substitution table that follows this kernel).
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_128__in_chans_96__kysz_3__kxsz_3( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const src_ix = GLOB_ID_1D;
  if( src_ix >= 110592 ) { return; } // past the last filter element

  // components of the source index (x fastest, out_chan slowest)
  int32_t const kx = src_ix % 3;
  int32_t const ky = (src_ix / 3) % 3;
  int32_t const in_chan = (src_ix / 9) % 96;
  int32_t const out_chan = src_ix / 864;

  // the output channel is split further into blk : tile : reg
  int32_t const oc_reg = out_chan % 8;
  int32_t const oc_tile = (out_chan / 8) % 16;
  int32_t const oc_blk = out_chan / 128;

  // destination index; out_chan_tile is the fastest-varying dimension
  int32_t const dst_ix =
    oc_blk*110592 +
    in_chan*1152 +
    ky*384 +
    kx*128 +
    oc_reg*16 +
    oc_tile*1;

#if 1
  float const val = in[src_ix];
#else
  // debug alternative: ignore in[] and write a pattern derived from the kernel x/y
  // position for input channel 0, and zero for all other input channels
  float val = 0.0f;
  if( in_chan == 0 ) {
    val = kx*100 + ky;
  }
#endif
  out[dst_ix] = val;
}
// -- template substitution table used: -- | |
/* out_chans = 128 */ | |
/* in_chans = 96 */ | |
/* kysz = 3 */ | |
/* kxsz = 3 */ | |
/* rtc_func_name = xpose_filts__out_chans_128__in_chans_96__kysz_3__kxsz_3 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 3 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%3) */ | |
/* filts_ix_y_dim = 3 */ | |
/* filts_ix_y_sz = 3 */ | |
/* filts_ix_y_nomod = (filts_ix/3) */ | |
/* filts_ix_y = ((filts_ix/3)%%3) */ | |
/* filts_ix_in_chan_dim = 96 */ | |
/* filts_ix_in_chan_sz = 9 */ | |
/* filts_ix_in_chan_nomod = (filts_ix/9) */ | |
/* filts_ix_in_chan = ((filts_ix/9)%%96) */ | |
/* filts_ix_out_chan_dim = 128 */ | |
/* filts_ix_out_chan_sz = 864 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/864) */ | |
/* filts_ix_out_chan = (filts_ix/864) */ | |
/* filts_ix_sz = 110592 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 3 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */ | |
/* filts_xp_ix_y_dim = 3 */ | |
/* filts_xp_ix_y_sz = 384 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */ | |
/* filts_xp_ix_in_chan_dim = 96 */ | |
/* filts_xp_ix_in_chan_sz = 1152 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%96) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 110592 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/110592) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/110592) */ | |
/* filts_xp_ix_sz = 110592 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 16 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%16) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 128 */ | |
/* fioc_out_chan_blk_nomod = (fioc/128) */ | |
/* fioc_out_chan_blk = (fioc/128) */ | |
/* fioc_sz = 128 */ | |
// 256 tbp | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_16__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out ) { | |
LOCSHAR_MEM float in_smem[64*8]; | |
int32_t const blk_filt_ix_sz = 2*8; | |
LOCSHAR_MEM float filts_smem[2*8]; // aka blk_filt_ix_sz, which wasn't const enough OpenCL | |
float out_tile[8*8] = {0}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // across patches (approx square block in x/y space, favoring x if sqrt() not integer) | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*3072; | |
int32_t const blk_patch_ix_sz = 64*8; | |
int32_t const blk_patch_ix_base = GRP_ID_1D*blk_patch_ix_sz; | |
// iteratate over filter elements | |
int32_t filts_off = blk_filt_ix_base; | |
for( int32_t filts_ix_out_chan_elem = 0; filts_ix_out_chan_elem != | |
(192 * 1 * 1); ++filts_ix_out_chan_elem ) { | |
BARRIER_SYNC; | |
if( LOC_ID_1D < blk_filt_ix_sz ) { | |
#ifdef NO_IOX // by default, we don't ever disable this, since it's seems about as good as it can be already | |
//filts_smem[LOC_ID_1D] = LOC_ID_1D; | |
filts_smem[LOC_ID_1D] = filts[LOC_ID_1D]; | |
#else | |
filts_smem[LOC_ID_1D] = filts[filts_off + LOC_ID_1D]; | |
#endif | |
} | |
for( int32_t i = 0; i != 4; ++i ) { | |
if( (LOC_ID_1D+LOC_SZ_1D*i) < blk_patch_ix_sz ) { | |
int32_t const t_smem_patch_ix = (blk_patch_ix_base+LOC_ID_1D+LOC_SZ_1D*i); | |
#ifdef NO_IO | |
//float v = LOC_ID_1D; | |
//float v = in[LOC_ID_1D]; | |
float v = in[filts_off + LOC_ID_1D]; | |
#else | |
float v = 0; | |
int const smem_in_ix_y = ((t_smem_patch_ix/28)%28)*1+(filts_ix_out_chan_elem%1) - 0; | |
int const smem_in_ix_x = (t_smem_patch_ix%28)*1+(filts_ix_out_chan_elem%1) - 0; | |
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 && | |
(t_smem_patch_ix/784) < 20 && | |
smem_in_ix_x < 28 && smem_in_ix_y < 28 ) { | |
v = in[(t_smem_patch_ix/784)*150528 + | |
filts_ix_out_chan_elem*784 + | |
smem_in_ix_y*28 + | |
smem_in_ix_x*1]; | |
}; | |
#endif | |
in_smem[LOC_ID_1D+LOC_SZ_1D*i] = v; | |
} | |
} | |
filts_off += 16; | |
BARRIER_SYNC; | |
#ifdef NO_IO | |
// begin t_tile_dummy_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D % 32) + 0]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D % 32) + 1]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D % 32) + 2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D % 32) + 3]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D % 32) + 4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D % 32) + 5]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D % 32) + 6]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D % 32) + 7]; | |
in_strip[0] = in_smem[(LOC_ID_1D % 32) + 0]; | |
in_strip[1] = in_smem[(LOC_ID_1D % 32) + 1]; | |
in_strip[2] = in_smem[(LOC_ID_1D % 32) + 2]; | |
in_strip[3] = in_smem[(LOC_ID_1D % 32) + 3]; | |
in_strip[4] = in_smem[(LOC_ID_1D % 32) + 4]; | |
in_strip[5] = in_smem[(LOC_ID_1D % 32) + 5]; | |
in_strip[6] = in_smem[(LOC_ID_1D % 32) + 6]; | |
in_strip[7] = in_smem[(LOC_ID_1D % 32) + 7]; | |
// end t_tile_dummy_loads; | |
#else | |
// begin t_tile_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D%2)+0*2]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D%2)+1*2]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D%2)+2*2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D%2)+3*2]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D%2)+4*2]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D%2)+5*2]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D%2)+6*2]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D%2)+7*2]; | |
in_strip[0] = in_smem[8*(LOC_ID_1D/2)+0]; | |
in_strip[1] = in_smem[8*(LOC_ID_1D/2)+1]; | |
in_strip[2] = in_smem[8*(LOC_ID_1D/2)+2]; | |
in_strip[3] = in_smem[8*(LOC_ID_1D/2)+3]; | |
in_strip[4] = in_smem[8*(LOC_ID_1D/2)+4]; | |
in_strip[5] = in_smem[8*(LOC_ID_1D/2)+5]; | |
in_strip[6] = in_smem[8*(LOC_ID_1D/2)+6]; | |
in_strip[7] = in_smem[8*(LOC_ID_1D/2)+7]; | |
// end t_tile_loads; | |
#endif | |
// (2) do 8^2 fmas into out_tile | |
// begin t_tile_fmas | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
// end t_tile_fmas; | |
} | |
// load per-block biases into smem | |
BARRIER_SYNC; | |
if( LOC_ID_1D < blk_filt_ix_sz ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*blk_filt_ix_sz; | |
int32_t const load_reg = LOC_ID_1D / 2; | |
int32_t const load_tile = LOC_ID_1D % 2; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 16 ) { filts_smem[LOC_ID_1D] = biases[ ocix ]; } | |
//int32_t const ocix_tile = (ocix / 8) % 2; | |
//int32_t const ocix_reg = ocix % 8; | |
//filts_smem[ocix_tile * 1 + ocix_reg * 2] = biases[ocix]; | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D%2)+0*2]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D%2)+1*2]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D%2)+2*2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D%2)+3*2]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D%2)+4*2]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D%2)+5*2]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D%2)+6*2]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D%2)+7*2]; | |
in_strip[0] = in_smem[8*(LOC_ID_1D/2)+0]; | |
in_strip[1] = in_smem[8*(LOC_ID_1D/2)+1]; | |
in_strip[2] = in_smem[8*(LOC_ID_1D/2)+2]; | |
in_strip[3] = in_smem[8*(LOC_ID_1D/2)+3]; | |
in_strip[4] = in_smem[8*(LOC_ID_1D/2)+4]; | |
in_strip[5] = in_smem[8*(LOC_ID_1D/2)+5]; | |
in_strip[6] = in_smem[8*(LOC_ID_1D/2)+6]; | |
in_strip[7] = in_smem[8*(LOC_ID_1D/2)+7]; | |
// end t_tile_loads; | |
// add bias to each elem of out_tile[] and store the results to out[] | |
#ifdef NO_IO | |
// begin t_tile_dummy_stores | |
out[0] = 0.0f | |
+ max(0.0f,out_tile[0] + filts_strip[0]) | |
+ max(0.0f,out_tile[1] + filts_strip[1]) | |
+ max(0.0f,out_tile[2] + filts_strip[2]) | |
+ max(0.0f,out_tile[3] + filts_strip[3]) | |
+ max(0.0f,out_tile[4] + filts_strip[4]) | |
+ max(0.0f,out_tile[5] + filts_strip[5]) | |
+ max(0.0f,out_tile[6] + filts_strip[6]) | |
+ max(0.0f,out_tile[7] + filts_strip[7]) | |
+ max(0.0f,out_tile[8] + filts_strip[0]) | |
+ max(0.0f,out_tile[9] + filts_strip[1]) | |
+ max(0.0f,out_tile[10] + filts_strip[2]) | |
+ max(0.0f,out_tile[11] + filts_strip[3]) | |
+ max(0.0f,out_tile[12] + filts_strip[4]) | |
+ max(0.0f,out_tile[13] + filts_strip[5]) | |
+ max(0.0f,out_tile[14] + filts_strip[6]) | |
+ max(0.0f,out_tile[15] + filts_strip[7]) | |
+ max(0.0f,out_tile[16] + filts_strip[0]) | |
+ max(0.0f,out_tile[17] + filts_strip[1]) | |
+ max(0.0f,out_tile[18] + filts_strip[2]) | |
+ max(0.0f,out_tile[19] + filts_strip[3]) | |
+ max(0.0f,out_tile[20] + filts_strip[4]) | |
+ max(0.0f,out_tile[21] + filts_strip[5]) | |
+ max(0.0f,out_tile[22] + filts_strip[6]) | |
+ max(0.0f,out_tile[23] + filts_strip[7]) | |
+ max(0.0f,out_tile[24] + filts_strip[0]) | |
+ max(0.0f,out_tile[25] + filts_strip[1]) | |
+ max(0.0f,out_tile[26] + filts_strip[2]) | |
+ max(0.0f,out_tile[27] + filts_strip[3]) | |
+ max(0.0f,out_tile[28] + filts_strip[4]) | |
+ max(0.0f,out_tile[29] + filts_strip[5]) | |
+ max(0.0f,out_tile[30] + filts_strip[6]) | |
+ max(0.0f,out_tile[31] + filts_strip[7]) | |
+ max(0.0f,out_tile[32] + filts_strip[0]) | |
+ max(0.0f,out_tile[33] + filts_strip[1]) | |
+ max(0.0f,out_tile[34] + filts_strip[2]) | |
+ max(0.0f,out_tile[35] + filts_strip[3]) | |
+ max(0.0f,out_tile[36] + filts_strip[4]) | |
+ max(0.0f,out_tile[37] + filts_strip[5]) | |
+ max(0.0f,out_tile[38] + filts_strip[6]) | |
+ max(0.0f,out_tile[39] + filts_strip[7]) | |
+ max(0.0f,out_tile[40] + filts_strip[0]) | |
+ max(0.0f,out_tile[41] + filts_strip[1]) | |
+ max(0.0f,out_tile[42] + filts_strip[2]) | |
+ max(0.0f,out_tile[43] + filts_strip[3]) | |
+ max(0.0f,out_tile[44] + filts_strip[4]) | |
+ max(0.0f,out_tile[45] + filts_strip[5]) | |
+ max(0.0f,out_tile[46] + filts_strip[6]) | |
+ max(0.0f,out_tile[47] + filts_strip[7]) | |
+ max(0.0f,out_tile[48] + filts_strip[0]) | |
+ max(0.0f,out_tile[49] + filts_strip[1]) | |
+ max(0.0f,out_tile[50] + filts_strip[2]) | |
+ max(0.0f,out_tile[51] + filts_strip[3]) | |
+ max(0.0f,out_tile[52] + filts_strip[4]) | |
+ max(0.0f,out_tile[53] + filts_strip[5]) | |
+ max(0.0f,out_tile[54] + filts_strip[6]) | |
+ max(0.0f,out_tile[55] + filts_strip[7]) | |
+ max(0.0f,out_tile[56] + filts_strip[0]) | |
+ max(0.0f,out_tile[57] + filts_strip[1]) | |
+ max(0.0f,out_tile[58] + filts_strip[2]) | |
+ max(0.0f,out_tile[59] + filts_strip[3]) | |
+ max(0.0f,out_tile[60] + filts_strip[4]) | |
+ max(0.0f,out_tile[61] + filts_strip[5]) | |
+ max(0.0f,out_tile[62] + filts_strip[6]) | |
+ max(0.0f,out_tile[63] + filts_strip[7]) | |
; | |
// end t_tile_dummy_stores; | |
#else | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+0)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+0) % 784 ); // cache out patch ixs | |
tpix[1] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+1)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+1) % 784 ); // cache out patch ixs | |
tpix[2] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+2)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+2) % 784 ); // cache out patch ixs | |
tpix[3] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+3)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+3) % 784 ); // cache out patch ixs | |
tpix[4] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+4)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+4) % 784 ); // cache out patch ixs | |
tpix[5] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+5)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+5) % 784 ); // cache out patch ixs | |
tpix[6] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+6)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+6) % 784 ); // cache out patch ixs | |
tpix[7] = ((((LOC_ID_1D/2)+GRP_ID_1D*64)*8+7)/784)*12544 + | |
( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+7) % 784 ); // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+0)*784; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+1)*784; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+2)*784; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+3)*784; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+4)*784; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+5)*784; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+6)*784; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%2)+(GRP_ID_1D%1)*2)*8)+7)*784; // cache out chan ixs | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+0) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+1) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+2) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+3) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+4) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+5) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+6) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (((LOC_ID_1D/2)+GRP_ID_1D*64)*8+7) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (16*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (16*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (16*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (16*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (16*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (16*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (16*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (16*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
#endif | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_pad = 0 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 1 */ | |
/* kern_sz = 1 */ | |
/* stride = 1 */ | |
/* out_chans = 16 */ | |
/* in_chans = 192 */ | |
/* rtc_func_name = conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_16__in_chans_192 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 16 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%16) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 12544 */ | |
/* out_ix_img_nomod = (out_ix/12544) */ | |
/* out_ix_img = (out_ix/12544) */ | |
/* out_ix_sz = 250880 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 192 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%192) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 150528 */ | |
/* in_ix_img_nomod = (in_ix/150528) */ | |
/* in_ix_img = (in_ix/150528) */ | |
/* in_ix_sz = 3010560 */ | |
/* t_smem_patch_ix_x_dim = 28 */ | |
/* t_smem_patch_ix_x_sz = 1 */ | |
/* t_smem_patch_ix_x_nomod = t_smem_patch_ix */ | |
/* t_smem_patch_ix_x = (t_smem_patch_ix%%28) */ | |
/* t_smem_patch_ix_y_dim = 28 */ | |
/* t_smem_patch_ix_y_sz = 28 */ | |
/* t_smem_patch_ix_y_nomod = (t_smem_patch_ix/28) */ | |
/* t_smem_patch_ix_y = ((t_smem_patch_ix/28)%%28) */ | |
/* t_smem_patch_ix_img_dim = 20 */ | |
/* t_smem_patch_ix_img_sz = 784 */ | |
/* t_smem_patch_ix_img_nomod = (t_smem_patch_ix/784) */ | |
/* t_smem_patch_ix_img = (t_smem_patch_ix/784) */ | |
/* t_smem_patch_ix_sz = 15680 */ | |
/* filts_ix_out_chan_elem_x_dim = 1 */ | |
/* filts_ix_out_chan_elem_x_sz = 1 */ | |
/* filts_ix_out_chan_elem_x_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_x = (filts_ix_out_chan_elem%%1) */ | |
/* filts_ix_out_chan_elem_y_dim = 1 */ | |
/* filts_ix_out_chan_elem_y_sz = 1 */ | |
/* filts_ix_out_chan_elem_y_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_y = (filts_ix_out_chan_elem%%1) */ | |
/* filts_ix_out_chan_elem_in_chan_dim = 192 */ | |
/* filts_ix_out_chan_elem_in_chan_sz = 1 */ | |
/* filts_ix_out_chan_elem_in_chan_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_in_chan = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_sz = 192 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 2 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%2) */ | |
/* LOC_ID_1D_patch_tile_dim = 64 */ | |
/* LOC_ID_1D_patch_tile_sz = 2 */ | |
/* LOC_ID_1D_patch_tile_nomod = (LOC_ID_1D/2) */ | |
/* LOC_ID_1D_patch_tile = (LOC_ID_1D/2) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* filts_xp_ix_out_chan_tile_dim = 2 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%2) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 2 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/2) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/2)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 16 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/16)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 16 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/16)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 16 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/16)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 3072 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/3072) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/3072) */ | |
/* filts_xp_ix_sz = 3072 */ | |
/* patch_smem_load_iter = 4 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_patch_blk_dim = 31 */ | |
/* GRP_ID_1D_patch_blk_sz = 1 */ | |
/* GRP_ID_1D_patch_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_patch_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 31 */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* patch_tile = (%(LOC_ID_1D_patch_tile)+%(GRP_ID_1D_patch_blk)*%(LOC_ID_1D_patch_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* patch_ix_0 = (%(patch_tile)*%(t_tile_sz)+0) */ | |
/* patch_ix_0_x_dim = 28 */ | |
/* patch_ix_0_x_sz = 1 */ | |
/* patch_ix_0_x_nomod = %(patch_ix_0) */ | |
/* patch_ix_0_x = (%(patch_ix_0)%%28) */ | |
/* patch_ix_0_y_dim = 28 */ | |
/* patch_ix_0_y_sz = 28 */ | |
/* patch_ix_0_y_nomod = (%(patch_ix_0)/28) */ | |
/* patch_ix_0_y = ((%(patch_ix_0)/28)%%28) */ | |
/* patch_ix_0_img_dim = 20 */ | |
/* patch_ix_0_img_sz = 784 */ | |
/* patch_ix_0_img_nomod = (%(patch_ix_0)/784) */ | |
/* patch_ix_0_img = (%(patch_ix_0)/784) */ | |
/* patch_ix_0_sz = 15680 */ | |
/* patch_ix_1 = (%(patch_tile)*%(t_tile_sz)+1) */ | |
/* patch_ix_1_x_dim = 28 */ | |
/* patch_ix_1_x_sz = 1 */ | |
/* patch_ix_1_x_nomod = %(patch_ix_1) */ | |
/* patch_ix_1_x = (%(patch_ix_1)%%28) */ | |
/* patch_ix_1_y_dim = 28 */ | |
/* patch_ix_1_y_sz = 28 */ | |
/* patch_ix_1_y_nomod = (%(patch_ix_1)/28) */ | |
/* patch_ix_1_y = ((%(patch_ix_1)/28)%%28) */ | |
/* patch_ix_1_img_dim = 20 */ | |
/* patch_ix_1_img_sz = 784 */ | |
/* patch_ix_1_img_nomod = (%(patch_ix_1)/784) */ | |
/* patch_ix_1_img = (%(patch_ix_1)/784) */ | |
/* patch_ix_1_sz = 15680 */ | |
/* patch_ix_2 = (%(patch_tile)*%(t_tile_sz)+2) */ | |
/* patch_ix_2_x_dim = 28 */ | |
/* patch_ix_2_x_sz = 1 */ | |
/* patch_ix_2_x_nomod = %(patch_ix_2) */ | |
/* patch_ix_2_x = (%(patch_ix_2)%%28) */ | |
/* patch_ix_2_y_dim = 28 */ | |
/* patch_ix_2_y_sz = 28 */ | |
/* patch_ix_2_y_nomod = (%(patch_ix_2)/28) */ | |
/* patch_ix_2_y = ((%(patch_ix_2)/28)%%28) */ | |
/* patch_ix_2_img_dim = 20 */ | |
/* patch_ix_2_img_sz = 784 */ | |
/* patch_ix_2_img_nomod = (%(patch_ix_2)/784) */ | |
/* patch_ix_2_img = (%(patch_ix_2)/784) */ | |
/* patch_ix_2_sz = 15680 */ | |
/* patch_ix_3 = (%(patch_tile)*%(t_tile_sz)+3) */ | |
/* patch_ix_3_x_dim = 28 */ | |
/* patch_ix_3_x_sz = 1 */ | |
/* patch_ix_3_x_nomod = %(patch_ix_3) */ | |
/* patch_ix_3_x = (%(patch_ix_3)%%28) */ | |
/* patch_ix_3_y_dim = 28 */ | |
/* patch_ix_3_y_sz = 28 */ | |
/* patch_ix_3_y_nomod = (%(patch_ix_3)/28) */ | |
/* patch_ix_3_y = ((%(patch_ix_3)/28)%%28) */ | |
/* patch_ix_3_img_dim = 20 */ | |
/* patch_ix_3_img_sz = 784 */ | |
/* patch_ix_3_img_nomod = (%(patch_ix_3)/784) */ | |
/* patch_ix_3_img = (%(patch_ix_3)/784) */ | |
/* patch_ix_3_sz = 15680 */ | |
/* patch_ix_4 = (%(patch_tile)*%(t_tile_sz)+4) */ | |
/* patch_ix_4_x_dim = 28 */ | |
/* patch_ix_4_x_sz = 1 */ | |
/* patch_ix_4_x_nomod = %(patch_ix_4) */ | |
/* patch_ix_4_x = (%(patch_ix_4)%%28) */ | |
/* patch_ix_4_y_dim = 28 */ | |
/* patch_ix_4_y_sz = 28 */ | |
/* patch_ix_4_y_nomod = (%(patch_ix_4)/28) */ | |
/* patch_ix_4_y = ((%(patch_ix_4)/28)%%28) */ | |
/* patch_ix_4_img_dim = 20 */ | |
/* patch_ix_4_img_sz = 784 */ | |
/* patch_ix_4_img_nomod = (%(patch_ix_4)/784) */ | |
/* patch_ix_4_img = (%(patch_ix_4)/784) */ | |
/* patch_ix_4_sz = 15680 */ | |
/* patch_ix_5 = (%(patch_tile)*%(t_tile_sz)+5) */ | |
/* patch_ix_5_x_dim = 28 */ | |
/* patch_ix_5_x_sz = 1 */ | |
/* patch_ix_5_x_nomod = %(patch_ix_5) */ | |
/* patch_ix_5_x = (%(patch_ix_5)%%28) */ | |
/* patch_ix_5_y_dim = 28 */ | |
/* patch_ix_5_y_sz = 28 */ | |
/* patch_ix_5_y_nomod = (%(patch_ix_5)/28) */ | |
/* patch_ix_5_y = ((%(patch_ix_5)/28)%%28) */ | |
/* patch_ix_5_img_dim = 20 */ | |
/* patch_ix_5_img_sz = 784 */ | |
/* patch_ix_5_img_nomod = (%(patch_ix_5)/784) */ | |
/* patch_ix_5_img = (%(patch_ix_5)/784) */ | |
/* patch_ix_5_sz = 15680 */ | |
/* patch_ix_6 = (%(patch_tile)*%(t_tile_sz)+6) */ | |
/* patch_ix_6_x_dim = 28 */ | |
/* patch_ix_6_x_sz = 1 */ | |
/* patch_ix_6_x_nomod = %(patch_ix_6) */ | |
/* patch_ix_6_x = (%(patch_ix_6)%%28) */ | |
/* patch_ix_6_y_dim = 28 */ | |
/* patch_ix_6_y_sz = 28 */ | |
/* patch_ix_6_y_nomod = (%(patch_ix_6)/28) */ | |
/* patch_ix_6_y = ((%(patch_ix_6)/28)%%28) */ | |
/* patch_ix_6_img_dim = 20 */ | |
/* patch_ix_6_img_sz = 784 */ | |
/* patch_ix_6_img_nomod = (%(patch_ix_6)/784) */ | |
/* patch_ix_6_img = (%(patch_ix_6)/784) */ | |
/* patch_ix_6_sz = 15680 */ | |
/* patch_ix_7 = (%(patch_tile)*%(t_tile_sz)+7) */ | |
/* patch_ix_7_x_dim = 28 */ | |
/* patch_ix_7_x_sz = 1 */ | |
/* patch_ix_7_x_nomod = %(patch_ix_7) */ | |
/* patch_ix_7_x = (%(patch_ix_7)%%28) */ | |
/* patch_ix_7_y_dim = 28 */ | |
/* patch_ix_7_y_sz = 28 */ | |
/* patch_ix_7_y_nomod = (%(patch_ix_7)/28) */ | |
/* patch_ix_7_y = ((%(patch_ix_7)/28)%%28) */ | |
/* patch_ix_7_img_dim = 20 */ | |
/* patch_ix_7_img_sz = 784 */ | |
/* patch_ix_7_img_nomod = (%(patch_ix_7)/784) */ | |
/* patch_ix_7_img = (%(patch_ix_7)/784) */ | |
/* patch_ix_7_sz = 15680 */ | |
/* get_in = float v = 0; | |
int const smem_in_ix_y = %(t_smem_patch_ix_y)*%(stride)+%(filts_ix_out_chan_elem_y) - %(in_pad); | |
int const smem_in_ix_x = %(t_smem_patch_ix_x)*%(stride)+%(filts_ix_out_chan_elem_x) - %(in_pad); | |
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 && | |
%(t_smem_patch_ix_img) < %(in_ix_img_dim) && | |
smem_in_ix_x < %(in_ix_x_dim) && smem_in_ix_y < %(in_ix_y_dim) ) { | |
v = in[%(t_smem_patch_ix_img)*%(in_ix_img_sz) + | |
%(filts_ix_out_chan_elem_in_chan)*%(in_ix_chan_sz) + | |
smem_in_ix_y*%(in_ix_y_sz) + | |
smem_in_ix_x*%(in_ix_x_sz)]; | |
} */ | |
/* t_tile_fmas = // begin t_tile_fmas | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
// end t_tile_fmas */ | |
/* t_tile_loads = // begin t_tile_loads | |
filts_strip[0] = filts_smem[%(LOC_ID_1D_out_chan_tile)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem[%(LOC_ID_1D_out_chan_tile)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem[%(LOC_ID_1D_out_chan_tile)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem[%(LOC_ID_1D_out_chan_tile)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem[%(LOC_ID_1D_out_chan_tile)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem[%(LOC_ID_1D_out_chan_tile)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem[%(LOC_ID_1D_out_chan_tile)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem[%(LOC_ID_1D_out_chan_tile)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+0]; | |
in_strip[1] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+1]; | |
in_strip[2] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+2]; | |
in_strip[3] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+3]; | |
in_strip[4] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+4]; | |
in_strip[5] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+5]; | |
in_strip[6] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+6]; | |
in_strip[7] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+7]; | |
// end t_tile_loads */ | |
/* t_tile_dummy_loads = // begin t_tile_dummy_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D %% 32) + 0]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D %% 32) + 1]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D %% 32) + 2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D %% 32) + 3]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D %% 32) + 4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D %% 32) + 5]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D %% 32) + 6]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D %% 32) + 7]; | |
in_strip[0] = in_smem[(LOC_ID_1D %% 32) + 0]; | |
in_strip[1] = in_smem[(LOC_ID_1D %% 32) + 1]; | |
in_strip[2] = in_smem[(LOC_ID_1D %% 32) + 2]; | |
in_strip[3] = in_smem[(LOC_ID_1D %% 32) + 3]; | |
in_strip[4] = in_smem[(LOC_ID_1D %% 32) + 4]; | |
in_strip[5] = in_smem[(LOC_ID_1D %% 32) + 5]; | |
in_strip[6] = in_smem[(LOC_ID_1D %% 32) + 6]; | |
in_strip[7] = in_smem[(LOC_ID_1D %% 32) + 7]; | |
// end t_tile_dummy_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
int32_t tpix[%(t_tile_sz)]; | |
int32_t tcix[%(t_tile_sz)]; | |
tpix[0] = %(patch_ix_0_img)*%(out_ix_img_sz) + | |
( %(patch_ix_0) %% %(patch_ix_0_img_sz) ); // cache out patch ixs | |
tpix[1] = %(patch_ix_1_img)*%(out_ix_img_sz) + | |
( %(patch_ix_1) %% %(patch_ix_1_img_sz) ); // cache out patch ixs | |
tpix[2] = %(patch_ix_2_img)*%(out_ix_img_sz) + | |
( %(patch_ix_2) %% %(patch_ix_2_img_sz) ); // cache out patch ixs | |
tpix[3] = %(patch_ix_3_img)*%(out_ix_img_sz) + | |
( %(patch_ix_3) %% %(patch_ix_3_img_sz) ); // cache out patch ixs | |
tpix[4] = %(patch_ix_4_img)*%(out_ix_img_sz) + | |
( %(patch_ix_4) %% %(patch_ix_4_img_sz) ); // cache out patch ixs | |
tpix[5] = %(patch_ix_5_img)*%(out_ix_img_sz) + | |
( %(patch_ix_5) %% %(patch_ix_5_img_sz) ); // cache out patch ixs | |
tpix[6] = %(patch_ix_6_img)*%(out_ix_img_sz) + | |
( %(patch_ix_6) %% %(patch_ix_6_img_sz) ); // cache out patch ixs | |
tpix[7] = %(patch_ix_7_img)*%(out_ix_img_sz) + | |
( %(patch_ix_7) %% %(patch_ix_7_img_sz) ); // cache out patch ixs | |
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs | |
if( %(patch_ix_0) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( %(patch_ix_1) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( %(patch_ix_2) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( %(patch_ix_3) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( %(patch_ix_4) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( %(patch_ix_5) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( %(patch_ix_6) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( %(patch_ix_7) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
/* t_tile_dummy_stores = // begin t_tile_dummy_stores | |
out[0] = 0.0f | |
+ max(0.0f,out_tile[0] + filts_strip[0]) | |
+ max(0.0f,out_tile[1] + filts_strip[1]) | |
+ max(0.0f,out_tile[2] + filts_strip[2]) | |
+ max(0.0f,out_tile[3] + filts_strip[3]) | |
+ max(0.0f,out_tile[4] + filts_strip[4]) | |
+ max(0.0f,out_tile[5] + filts_strip[5]) | |
+ max(0.0f,out_tile[6] + filts_strip[6]) | |
+ max(0.0f,out_tile[7] + filts_strip[7]) | |
+ max(0.0f,out_tile[8] + filts_strip[0]) | |
+ max(0.0f,out_tile[9] + filts_strip[1]) | |
+ max(0.0f,out_tile[10] + filts_strip[2]) | |
+ max(0.0f,out_tile[11] + filts_strip[3]) | |
+ max(0.0f,out_tile[12] + filts_strip[4]) | |
+ max(0.0f,out_tile[13] + filts_strip[5]) | |
+ max(0.0f,out_tile[14] + filts_strip[6]) | |
+ max(0.0f,out_tile[15] + filts_strip[7]) | |
+ max(0.0f,out_tile[16] + filts_strip[0]) | |
+ max(0.0f,out_tile[17] + filts_strip[1]) | |
+ max(0.0f,out_tile[18] + filts_strip[2]) | |
+ max(0.0f,out_tile[19] + filts_strip[3]) | |
+ max(0.0f,out_tile[20] + filts_strip[4]) | |
+ max(0.0f,out_tile[21] + filts_strip[5]) | |
+ max(0.0f,out_tile[22] + filts_strip[6]) | |
+ max(0.0f,out_tile[23] + filts_strip[7]) | |
+ max(0.0f,out_tile[24] + filts_strip[0]) | |
+ max(0.0f,out_tile[25] + filts_strip[1]) | |
+ max(0.0f,out_tile[26] + filts_strip[2]) | |
+ max(0.0f,out_tile[27] + filts_strip[3]) | |
+ max(0.0f,out_tile[28] + filts_strip[4]) | |
+ max(0.0f,out_tile[29] + filts_strip[5]) | |
+ max(0.0f,out_tile[30] + filts_strip[6]) | |
+ max(0.0f,out_tile[31] + filts_strip[7]) | |
+ max(0.0f,out_tile[32] + filts_strip[0]) | |
+ max(0.0f,out_tile[33] + filts_strip[1]) | |
+ max(0.0f,out_tile[34] + filts_strip[2]) | |
+ max(0.0f,out_tile[35] + filts_strip[3]) | |
+ max(0.0f,out_tile[36] + filts_strip[4]) | |
+ max(0.0f,out_tile[37] + filts_strip[5]) | |
+ max(0.0f,out_tile[38] + filts_strip[6]) | |
+ max(0.0f,out_tile[39] + filts_strip[7]) | |
+ max(0.0f,out_tile[40] + filts_strip[0]) | |
+ max(0.0f,out_tile[41] + filts_strip[1]) | |
+ max(0.0f,out_tile[42] + filts_strip[2]) | |
+ max(0.0f,out_tile[43] + filts_strip[3]) | |
+ max(0.0f,out_tile[44] + filts_strip[4]) | |
+ max(0.0f,out_tile[45] + filts_strip[5]) | |
+ max(0.0f,out_tile[46] + filts_strip[6]) | |
+ max(0.0f,out_tile[47] + filts_strip[7]) | |
+ max(0.0f,out_tile[48] + filts_strip[0]) | |
+ max(0.0f,out_tile[49] + filts_strip[1]) | |
+ max(0.0f,out_tile[50] + filts_strip[2]) | |
+ max(0.0f,out_tile[51] + filts_strip[3]) | |
+ max(0.0f,out_tile[52] + filts_strip[4]) | |
+ max(0.0f,out_tile[53] + filts_strip[5]) | |
+ max(0.0f,out_tile[54] + filts_strip[6]) | |
+ max(0.0f,out_tile[55] + filts_strip[7]) | |
+ max(0.0f,out_tile[56] + filts_strip[0]) | |
+ max(0.0f,out_tile[57] + filts_strip[1]) | |
+ max(0.0f,out_tile[58] + filts_strip[2]) | |
+ max(0.0f,out_tile[59] + filts_strip[3]) | |
+ max(0.0f,out_tile[60] + filts_strip[4]) | |
+ max(0.0f,out_tile[61] + filts_strip[5]) | |
+ max(0.0f,out_tile[62] + filts_strip[6]) | |
+ max(0.0f,out_tile[63] + filts_strip[7]) | |
; | |
// end t_tile_dummy_stores */ | |
// Filter re-layout kernel: each work-item copies one float from in (laid out out_chan:in_chan:y:x,
// 16*192*1*1 == 3072 elements) to its slot in out (laid out out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile).
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_16__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
										 GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D; // flat source index handled by this work-item
  if( filts_ix < 3072 ) { // work-items at or beyond the 3072 filter elements do nothing
    // split the flat source index into its out_chan:in_chan:y:x components (y and x both have dim 1)
    int32_t const fioc = filts_ix / 192;
    int32_t const in_chan = filts_ix % 192;
    int32_t const y = filts_ix % 1;
    int32_t const x = filts_ix % 1;

    // split the output channel into blk:tile:reg (dims 1:2:8)
    int32_t const out_chan_blk = fioc / 16;
    int32_t const out_chan_tile = ( fioc / 8 ) % 2;
    int32_t const out_chan_reg = fioc % 8;

    // recombine using the destination strides: blk=3072, in_chan=16, y=16, x=16, reg=2, tile=1
    int32_t const filts_xp_ix =
      out_chan_blk * 3072 +
      in_chan * 16 +
      y * 16 +
      x * 16 +
      out_chan_reg * 2 +
      out_chan_tile * 1;

#if 1
    float const val = in[filts_ix];
#else
    // debug pattern (disabled): y*100+x for the first input channel, zero everywhere else
    float const val = ( in_chan == 0 ) ? (float)( y * 100 + x ) : 0.0f;
#endif
    out[filts_xp_ix] = val;
  }
}
// -- template substitution table used: -- | |
/* out_chans = 16 */ | |
/* in_chans = 192 */ | |
/* kysz = 1 */ | |
/* kxsz = 1 */ | |
/* rtc_func_name = xpose_filts__out_chans_16__in_chans_192__kysz_1__kxsz_1 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 1 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%1) */ | |
/* filts_ix_y_dim = 1 */ | |
/* filts_ix_y_sz = 1 */ | |
/* filts_ix_y_nomod = filts_ix */ | |
/* filts_ix_y = (filts_ix%%1) */ | |
/* filts_ix_in_chan_dim = 192 */ | |
/* filts_ix_in_chan_sz = 1 */ | |
/* filts_ix_in_chan_nomod = filts_ix */ | |
/* filts_ix_in_chan = (filts_ix%%192) */ | |
/* filts_ix_out_chan_dim = 16 */ | |
/* filts_ix_out_chan_sz = 192 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/192) */ | |
/* filts_ix_out_chan = (filts_ix/192) */ | |
/* filts_ix_sz = 3072 */ | |
/* filts_xp_ix_out_chan_tile_dim = 2 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%2) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 2 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/2) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/2)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 16 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/16)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 16 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/16)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 16 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/16)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 3072 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/3072) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/3072) */ | |
/* filts_xp_ix_sz = 3072 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 2 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%2) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 16 */ | |
/* fioc_out_chan_blk_nomod = (fioc/16) */ | |
/* fioc_out_chan_blk = (fioc/16) */ | |
/* fioc_sz = 16 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_5__stride_1__in_pad_2__t_tile_sz_8__conv_has_relu_1__out_chans_32__in_chans_16( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(160+480,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 160; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[12]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t blk_in_ix_base = GRP_ID_1D*7680 + LOC_ID_1D;// index of first input pel to load for this thread | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*12800; // index of first out chan | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%4); | |
int32_t out_line = (GRP_ID_1D/4)*32; // first out_line of block | |
int32_t const blk_fli = (out_line/28); // image of first out_line of block | |
out_line += (LOC_ID_1D/4); // adjust to out_line of this thread | |
// offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img) | |
int32_t const img_off_lines = ((out_line/28) - blk_fli)*(5-1); | |
int32_t const in_y = (out_line%28)*1 - 2; | |
for( int32_t in_chan = 0; in_chan != 16; ++in_chan ) { | |
BARRIER_SYNC; | |
// begin in_smem_loads | |
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ]; | |
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ]; | |
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ]; | |
if( (LOC_ID_1D + 128 * 3) < 480) { in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ];} | |
blk_in_ix_base += 480; | |
// end in_smem_loads; | |
for( int32_t ky = 0; ky != 5; ++ky ) { | |
if( ky != 0 ) { BARRIER_SYNC; } | |
// begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
if( (LOC_ID_1D + 128 * 1) < 160 ) { filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];} | |
filts_off += 160; | |
// end filt_smem_loads; | |
BARRIER_SYNC; | |
if( (out_line/28) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid) | |
if( ((in_y+ky) < 0) || ((in_y+ky)>28) ) { continue; } // optimization: skip known-to-be-padding input lines | |
LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/4)*1+ky+img_off_lines)*12; | |
// begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
in_strip[10] = in_smem_off[10]; | |
in_strip[11] = in_smem_off[11]; | |
filts_strip[0] = filts_smem_off[0*32+0*4]; | |
filts_strip[1] = filts_smem_off[0*32+1*4]; | |
filts_strip[2] = filts_smem_off[0*32+2*4]; | |
filts_strip[3] = filts_smem_off[0*32+3*4]; | |
filts_strip[4] = filts_smem_off[0*32+4*4]; | |
filts_strip[5] = filts_smem_off[0*32+5*4]; | |
filts_strip[6] = filts_smem_off[0*32+6*4]; | |
filts_strip[7] = filts_smem_off[0*32+7*4]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*32+0*4]; | |
filts_strip[1] = filts_smem_off[1*32+1*4]; | |
filts_strip[2] = filts_smem_off[1*32+2*4]; | |
filts_strip[3] = filts_smem_off[1*32+3*4]; | |
filts_strip[4] = filts_smem_off[1*32+4*4]; | |
filts_strip[5] = filts_smem_off[1*32+5*4]; | |
filts_strip[6] = filts_smem_off[1*32+6*4]; | |
filts_strip[7] = filts_smem_off[1*32+7*4]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
filts_strip[0] = filts_smem_off[2*32+0*4]; | |
filts_strip[1] = filts_smem_off[2*32+1*4]; | |
filts_strip[2] = filts_smem_off[2*32+2*4]; | |
filts_strip[3] = filts_smem_off[2*32+3*4]; | |
filts_strip[4] = filts_smem_off[2*32+4*4]; | |
filts_strip[5] = filts_smem_off[2*32+5*4]; | |
filts_strip[6] = filts_smem_off[2*32+6*4]; | |
filts_strip[7] = filts_smem_off[2*32+7*4]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
filts_strip[0] = filts_smem_off[3*32+0*4]; | |
filts_strip[1] = filts_smem_off[3*32+1*4]; | |
filts_strip[2] = filts_smem_off[3*32+2*4]; | |
filts_strip[3] = filts_smem_off[3*32+3*4]; | |
filts_strip[4] = filts_smem_off[3*32+4*4]; | |
filts_strip[5] = filts_smem_off[3*32+5*4]; | |
filts_strip[6] = filts_smem_off[3*32+6*4]; | |
filts_strip[7] = filts_smem_off[3*32+7*4]; | |
out_tile[0] += filts_strip[0]*in_strip[3]; | |
out_tile[1] += filts_strip[1]*in_strip[3]; | |
out_tile[2] += filts_strip[2]*in_strip[3]; | |
out_tile[3] += filts_strip[3]*in_strip[3]; | |
out_tile[4] += filts_strip[4]*in_strip[3]; | |
out_tile[5] += filts_strip[5]*in_strip[3]; | |
out_tile[6] += filts_strip[6]*in_strip[3]; | |
out_tile[7] += filts_strip[7]*in_strip[3]; | |
out_tile[8] += filts_strip[0]*in_strip[4]; | |
out_tile[9] += filts_strip[1]*in_strip[4]; | |
out_tile[10] += filts_strip[2]*in_strip[4]; | |
out_tile[11] += filts_strip[3]*in_strip[4]; | |
out_tile[12] += filts_strip[4]*in_strip[4]; | |
out_tile[13] += filts_strip[5]*in_strip[4]; | |
out_tile[14] += filts_strip[6]*in_strip[4]; | |
out_tile[15] += filts_strip[7]*in_strip[4]; | |
out_tile[16] += filts_strip[0]*in_strip[5]; | |
out_tile[17] += filts_strip[1]*in_strip[5]; | |
out_tile[18] += filts_strip[2]*in_strip[5]; | |
out_tile[19] += filts_strip[3]*in_strip[5]; | |
out_tile[20] += filts_strip[4]*in_strip[5]; | |
out_tile[21] += filts_strip[5]*in_strip[5]; | |
out_tile[22] += filts_strip[6]*in_strip[5]; | |
out_tile[23] += filts_strip[7]*in_strip[5]; | |
out_tile[24] += filts_strip[0]*in_strip[6]; | |
out_tile[25] += filts_strip[1]*in_strip[6]; | |
out_tile[26] += filts_strip[2]*in_strip[6]; | |
out_tile[27] += filts_strip[3]*in_strip[6]; | |
out_tile[28] += filts_strip[4]*in_strip[6]; | |
out_tile[29] += filts_strip[5]*in_strip[6]; | |
out_tile[30] += filts_strip[6]*in_strip[6]; | |
out_tile[31] += filts_strip[7]*in_strip[6]; | |
out_tile[32] += filts_strip[0]*in_strip[7]; | |
out_tile[33] += filts_strip[1]*in_strip[7]; | |
out_tile[34] += filts_strip[2]*in_strip[7]; | |
out_tile[35] += filts_strip[3]*in_strip[7]; | |
out_tile[36] += filts_strip[4]*in_strip[7]; | |
out_tile[37] += filts_strip[5]*in_strip[7]; | |
out_tile[38] += filts_strip[6]*in_strip[7]; | |
out_tile[39] += filts_strip[7]*in_strip[7]; | |
out_tile[40] += filts_strip[0]*in_strip[8]; | |
out_tile[41] += filts_strip[1]*in_strip[8]; | |
out_tile[42] += filts_strip[2]*in_strip[8]; | |
out_tile[43] += filts_strip[3]*in_strip[8]; | |
out_tile[44] += filts_strip[4]*in_strip[8]; | |
out_tile[45] += filts_strip[5]*in_strip[8]; | |
out_tile[46] += filts_strip[6]*in_strip[8]; | |
out_tile[47] += filts_strip[7]*in_strip[8]; | |
out_tile[48] += filts_strip[0]*in_strip[9]; | |
out_tile[49] += filts_strip[1]*in_strip[9]; | |
out_tile[50] += filts_strip[2]*in_strip[9]; | |
out_tile[51] += filts_strip[3]*in_strip[9]; | |
out_tile[52] += filts_strip[4]*in_strip[9]; | |
out_tile[53] += filts_strip[5]*in_strip[9]; | |
out_tile[54] += filts_strip[6]*in_strip[9]; | |
out_tile[55] += filts_strip[7]*in_strip[9]; | |
out_tile[56] += filts_strip[0]*in_strip[10]; | |
out_tile[57] += filts_strip[1]*in_strip[10]; | |
out_tile[58] += filts_strip[2]*in_strip[10]; | |
out_tile[59] += filts_strip[3]*in_strip[10]; | |
out_tile[60] += filts_strip[4]*in_strip[10]; | |
out_tile[61] += filts_strip[5]*in_strip[10]; | |
out_tile[62] += filts_strip[6]*in_strip[10]; | |
out_tile[63] += filts_strip[7]*in_strip[10]; | |
filts_strip[0] = filts_smem_off[4*32+0*4]; | |
filts_strip[1] = filts_smem_off[4*32+1*4]; | |
filts_strip[2] = filts_smem_off[4*32+2*4]; | |
filts_strip[3] = filts_smem_off[4*32+3*4]; | |
filts_strip[4] = filts_smem_off[4*32+4*4]; | |
filts_strip[5] = filts_smem_off[4*32+5*4]; | |
filts_strip[6] = filts_smem_off[4*32+6*4]; | |
filts_strip[7] = filts_smem_off[4*32+7*4]; | |
out_tile[0] += filts_strip[0]*in_strip[4]; | |
out_tile[1] += filts_strip[1]*in_strip[4]; | |
out_tile[2] += filts_strip[2]*in_strip[4]; | |
out_tile[3] += filts_strip[3]*in_strip[4]; | |
out_tile[4] += filts_strip[4]*in_strip[4]; | |
out_tile[5] += filts_strip[5]*in_strip[4]; | |
out_tile[6] += filts_strip[6]*in_strip[4]; | |
out_tile[7] += filts_strip[7]*in_strip[4]; | |
out_tile[8] += filts_strip[0]*in_strip[5]; | |
out_tile[9] += filts_strip[1]*in_strip[5]; | |
out_tile[10] += filts_strip[2]*in_strip[5]; | |
out_tile[11] += filts_strip[3]*in_strip[5]; | |
out_tile[12] += filts_strip[4]*in_strip[5]; | |
out_tile[13] += filts_strip[5]*in_strip[5]; | |
out_tile[14] += filts_strip[6]*in_strip[5]; | |
out_tile[15] += filts_strip[7]*in_strip[5]; | |
out_tile[16] += filts_strip[0]*in_strip[6]; | |
out_tile[17] += filts_strip[1]*in_strip[6]; | |
out_tile[18] += filts_strip[2]*in_strip[6]; | |
out_tile[19] += filts_strip[3]*in_strip[6]; | |
out_tile[20] += filts_strip[4]*in_strip[6]; | |
out_tile[21] += filts_strip[5]*in_strip[6]; | |
out_tile[22] += filts_strip[6]*in_strip[6]; | |
out_tile[23] += filts_strip[7]*in_strip[6]; | |
out_tile[24] += filts_strip[0]*in_strip[7]; | |
out_tile[25] += filts_strip[1]*in_strip[7]; | |
out_tile[26] += filts_strip[2]*in_strip[7]; | |
out_tile[27] += filts_strip[3]*in_strip[7]; | |
out_tile[28] += filts_strip[4]*in_strip[7]; | |
out_tile[29] += filts_strip[5]*in_strip[7]; | |
out_tile[30] += filts_strip[6]*in_strip[7]; | |
out_tile[31] += filts_strip[7]*in_strip[7]; | |
out_tile[32] += filts_strip[0]*in_strip[8]; | |
out_tile[33] += filts_strip[1]*in_strip[8]; | |
out_tile[34] += filts_strip[2]*in_strip[8]; | |
out_tile[35] += filts_strip[3]*in_strip[8]; | |
out_tile[36] += filts_strip[4]*in_strip[8]; | |
out_tile[37] += filts_strip[5]*in_strip[8]; | |
out_tile[38] += filts_strip[6]*in_strip[8]; | |
out_tile[39] += filts_strip[7]*in_strip[8]; | |
out_tile[40] += filts_strip[0]*in_strip[9]; | |
out_tile[41] += filts_strip[1]*in_strip[9]; | |
out_tile[42] += filts_strip[2]*in_strip[9]; | |
out_tile[43] += filts_strip[3]*in_strip[9]; | |
out_tile[44] += filts_strip[4]*in_strip[9]; | |
out_tile[45] += filts_strip[5]*in_strip[9]; | |
out_tile[46] += filts_strip[6]*in_strip[9]; | |
out_tile[47] += filts_strip[7]*in_strip[9]; | |
out_tile[48] += filts_strip[0]*in_strip[10]; | |
out_tile[49] += filts_strip[1]*in_strip[10]; | |
out_tile[50] += filts_strip[2]*in_strip[10]; | |
out_tile[51] += filts_strip[3]*in_strip[10]; | |
out_tile[52] += filts_strip[4]*in_strip[10]; | |
out_tile[53] += filts_strip[5]*in_strip[10]; | |
out_tile[54] += filts_strip[6]*in_strip[10]; | |
out_tile[55] += filts_strip[7]*in_strip[10]; | |
out_tile[56] += filts_strip[0]*in_strip[11]; | |
out_tile[57] += filts_strip[1]*in_strip[11]; | |
out_tile[58] += filts_strip[2]*in_strip[11]; | |
out_tile[59] += filts_strip[3]*in_strip[11]; | |
out_tile[60] += filts_strip[4]*in_strip[11]; | |
out_tile[61] += filts_strip[5]*in_strip[11]; | |
out_tile[62] += filts_strip[6]*in_strip[11]; | |
out_tile[63] += filts_strip[7]*in_strip[11]; | |
; | |
} | |
} | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 32 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*32; | |
int32_t const load_reg = t_smem_bias_ix / 4; | |
int32_t const load_tile = t_smem_bias_ix % 4; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 32 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*4]; | |
filts_strip[1] = filts_smem_off[1*4]; | |
filts_strip[2] = filts_smem_off[2*4]; | |
filts_strip[3] = filts_smem_off[3*4]; | |
filts_strip[4] = filts_smem_off[4*4]; | |
filts_strip[5] = filts_smem_off[5*4]; | |
filts_strip[6] = filts_smem_off[6*4]; | |
filts_strip[7] = filts_smem_off[7*4]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { return; } | |
// begin t_tile_stores | |
if( (out_line/28) >= 20 ) { return; } | |
int32_t out_x = (GRP_ID_1D%4)*8; | |
int32_t out_chan = ((GRP_ID_1D%1)*4 + (LOC_ID_1D%4))*8; | |
GASQ float * out_off = out + (out_line/28)*25088 + out_chan*784 + (out_line%28)*28 + out_x*1 ; | |
if( (out_x + 0) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < 32 ) { out_off[ 0*784 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < 32 ) { out_off[ 1*784 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < 32 ) { out_off[ 2*784 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < 32 ) { out_off[ 3*784 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < 32 ) { out_off[ 4*784 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < 32 ) { out_off[ 5*784 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < 32 ) { out_off[ 6*784 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < 32 ) { out_off[ 7*784 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* kern_sz = 5 */ | |
/* stride = 1 */ | |
/* in_pad = 2 */ | |
/* t_tile_sz = 8 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 32 */ | |
/* in_chans = 16 */ | |
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_5__stride_1__in_pad_2__t_tile_sz_8__conv_has_relu_1__out_chans_32__in_chans_16 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 32 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%32) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 25088 */ | |
/* out_ix_img_nomod = (out_ix/25088) */ | |
/* out_ix_img = (out_ix/25088) */ | |
/* out_ix_sz = 501760 */ | |
/* tpb = 128 */ | |
/* out_line_y_dim = 28 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%28) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 28 */ | |
/* out_line_img_nomod = (out_line/28) */ | |
/* out_line_img = (out_line/28) */ | |
/* out_line_sz = 560 */ | |
/* in_ix_blk_x_dim = 12 */ | |
/* in_ix_blk_x_sz = 1 */ | |
/* in_ix_blk_x_nomod = in_ix */ | |
/* in_ix_blk_x = (in_ix%%12) */ | |
/* in_ix_blk_y_dim = 40 */ | |
/* in_ix_blk_y_sz = 12 */ | |
/* in_ix_blk_y_nomod = (in_ix/12) */ | |
/* in_ix_blk_y = ((in_ix/12)%%40) */ | |
/* in_ix_blk_in_chan_dim = 16 */ | |
/* in_ix_blk_in_chan_sz = 480 */ | |
/* in_ix_blk_in_chan_nomod = (in_ix/480) */ | |
/* in_ix_blk_in_chan = ((in_ix/480)%%16) */ | |
/* in_ix_blk_bx_dim = 4 */ | |
/* in_ix_blk_bx_sz = 7680 */ | |
/* in_ix_blk_bx_nomod = (in_ix/7680) */ | |
/* in_ix_blk_bx = ((in_ix/7680)%%4) */ | |
/* in_ix_blk_bline_dim = 18 */ | |
/* in_ix_blk_bline_sz = 30720 */ | |
/* in_ix_blk_bline_nomod = (in_ix/30720) */ | |
/* in_ix_blk_bline = (in_ix/30720) */ | |
/* in_ix_sz = 552960 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 4 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%4) */ | |
/* LOC_ID_1D_blk_y_dim = 32 */ | |
/* LOC_ID_1D_blk_y_sz = 4 */ | |
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/4) */ | |
/* LOC_ID_1D_blk_y = (LOC_ID_1D/4) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_blk_bx_dim = 4 */ | |
/* GRP_ID_1D_blk_bx_sz = 1 */ | |
/* GRP_ID_1D_blk_bx_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_blk_bx = (GRP_ID_1D%%4) */ | |
/* GRP_ID_1D_blk_bline_dim = 18 */ | |
/* GRP_ID_1D_blk_bline_sz = 4 */ | |
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/4) */ | |
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/4) */ | |
/* GRP_ID_1D_sz = 72 */ | |
/* blk_filt_ix_sz = 32 */ | |
/* filts_smem_sz = 160 */ | |
/* in_smem_sz = 480 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1024 */ | |
/* filts_xp_ix_out_chan_tile_dim = 4 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 4 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */ | |
/* filts_xp_ix_x_dim = 5 */ | |
/* filts_xp_ix_x_sz = 32 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/32)%%5) */ | |
/* filts_xp_ix_y_dim = 5 */ | |
/* filts_xp_ix_y_sz = 160 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/160) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/160)%%5) */ | |
/* filts_xp_ix_in_chan_dim = 16 */ | |
/* filts_xp_ix_in_chan_sz = 800 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/800) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/800)%%16) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 12800 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12800) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12800) */ | |
/* filts_xp_ix_sz = 12800 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* filt_smem_loads = // begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
if( (LOC_ID_1D + %(tpb) * 1) < %(filts_smem_sz) ) { filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)];} | |
filts_off += %(filts_xp_ix_y_sz); | |
// end filt_smem_loads */ | |
/* in_smem_loads = // begin in_smem_loads | |
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ]; | |
if( (LOC_ID_1D + %(tpb) * 3) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ];} | |
blk_in_ix_base += %(in_ix_blk_in_chan_sz); | |
// end in_smem_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
in_strip[10] = in_smem_off[10]; | |
in_strip[11] = in_smem_off[11]; | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[3]; | |
out_tile[1] += filts_strip[1]*in_strip[3]; | |
out_tile[2] += filts_strip[2]*in_strip[3]; | |
out_tile[3] += filts_strip[3]*in_strip[3]; | |
out_tile[4] += filts_strip[4]*in_strip[3]; | |
out_tile[5] += filts_strip[5]*in_strip[3]; | |
out_tile[6] += filts_strip[6]*in_strip[3]; | |
out_tile[7] += filts_strip[7]*in_strip[3]; | |
out_tile[8] += filts_strip[0]*in_strip[4]; | |
out_tile[9] += filts_strip[1]*in_strip[4]; | |
out_tile[10] += filts_strip[2]*in_strip[4]; | |
out_tile[11] += filts_strip[3]*in_strip[4]; | |
out_tile[12] += filts_strip[4]*in_strip[4]; | |
out_tile[13] += filts_strip[5]*in_strip[4]; | |
out_tile[14] += filts_strip[6]*in_strip[4]; | |
out_tile[15] += filts_strip[7]*in_strip[4]; | |
out_tile[16] += filts_strip[0]*in_strip[5]; | |
out_tile[17] += filts_strip[1]*in_strip[5]; | |
out_tile[18] += filts_strip[2]*in_strip[5]; | |
out_tile[19] += filts_strip[3]*in_strip[5]; | |
out_tile[20] += filts_strip[4]*in_strip[5]; | |
out_tile[21] += filts_strip[5]*in_strip[5]; | |
out_tile[22] += filts_strip[6]*in_strip[5]; | |
out_tile[23] += filts_strip[7]*in_strip[5]; | |
out_tile[24] += filts_strip[0]*in_strip[6]; | |
out_tile[25] += filts_strip[1]*in_strip[6]; | |
out_tile[26] += filts_strip[2]*in_strip[6]; | |
out_tile[27] += filts_strip[3]*in_strip[6]; | |
out_tile[28] += filts_strip[4]*in_strip[6]; | |
out_tile[29] += filts_strip[5]*in_strip[6]; | |
out_tile[30] += filts_strip[6]*in_strip[6]; | |
out_tile[31] += filts_strip[7]*in_strip[6]; | |
out_tile[32] += filts_strip[0]*in_strip[7]; | |
out_tile[33] += filts_strip[1]*in_strip[7]; | |
out_tile[34] += filts_strip[2]*in_strip[7]; | |
out_tile[35] += filts_strip[3]*in_strip[7]; | |
out_tile[36] += filts_strip[4]*in_strip[7]; | |
out_tile[37] += filts_strip[5]*in_strip[7]; | |
out_tile[38] += filts_strip[6]*in_strip[7]; | |
out_tile[39] += filts_strip[7]*in_strip[7]; | |
out_tile[40] += filts_strip[0]*in_strip[8]; | |
out_tile[41] += filts_strip[1]*in_strip[8]; | |
out_tile[42] += filts_strip[2]*in_strip[8]; | |
out_tile[43] += filts_strip[3]*in_strip[8]; | |
out_tile[44] += filts_strip[4]*in_strip[8]; | |
out_tile[45] += filts_strip[5]*in_strip[8]; | |
out_tile[46] += filts_strip[6]*in_strip[8]; | |
out_tile[47] += filts_strip[7]*in_strip[8]; | |
out_tile[48] += filts_strip[0]*in_strip[9]; | |
out_tile[49] += filts_strip[1]*in_strip[9]; | |
out_tile[50] += filts_strip[2]*in_strip[9]; | |
out_tile[51] += filts_strip[3]*in_strip[9]; | |
out_tile[52] += filts_strip[4]*in_strip[9]; | |
out_tile[53] += filts_strip[5]*in_strip[9]; | |
out_tile[54] += filts_strip[6]*in_strip[9]; | |
out_tile[55] += filts_strip[7]*in_strip[9]; | |
out_tile[56] += filts_strip[0]*in_strip[10]; | |
out_tile[57] += filts_strip[1]*in_strip[10]; | |
out_tile[58] += filts_strip[2]*in_strip[10]; | |
out_tile[59] += filts_strip[3]*in_strip[10]; | |
out_tile[60] += filts_strip[4]*in_strip[10]; | |
out_tile[61] += filts_strip[5]*in_strip[10]; | |
out_tile[62] += filts_strip[6]*in_strip[10]; | |
out_tile[63] += filts_strip[7]*in_strip[10]; | |
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[4]; | |
out_tile[1] += filts_strip[1]*in_strip[4]; | |
out_tile[2] += filts_strip[2]*in_strip[4]; | |
out_tile[3] += filts_strip[3]*in_strip[4]; | |
out_tile[4] += filts_strip[4]*in_strip[4]; | |
out_tile[5] += filts_strip[5]*in_strip[4]; | |
out_tile[6] += filts_strip[6]*in_strip[4]; | |
out_tile[7] += filts_strip[7]*in_strip[4]; | |
out_tile[8] += filts_strip[0]*in_strip[5]; | |
out_tile[9] += filts_strip[1]*in_strip[5]; | |
out_tile[10] += filts_strip[2]*in_strip[5]; | |
out_tile[11] += filts_strip[3]*in_strip[5]; | |
out_tile[12] += filts_strip[4]*in_strip[5]; | |
out_tile[13] += filts_strip[5]*in_strip[5]; | |
out_tile[14] += filts_strip[6]*in_strip[5]; | |
out_tile[15] += filts_strip[7]*in_strip[5]; | |
out_tile[16] += filts_strip[0]*in_strip[6]; | |
out_tile[17] += filts_strip[1]*in_strip[6]; | |
out_tile[18] += filts_strip[2]*in_strip[6]; | |
out_tile[19] += filts_strip[3]*in_strip[6]; | |
out_tile[20] += filts_strip[4]*in_strip[6]; | |
out_tile[21] += filts_strip[5]*in_strip[6]; | |
out_tile[22] += filts_strip[6]*in_strip[6]; | |
out_tile[23] += filts_strip[7]*in_strip[6]; | |
out_tile[24] += filts_strip[0]*in_strip[7]; | |
out_tile[25] += filts_strip[1]*in_strip[7]; | |
out_tile[26] += filts_strip[2]*in_strip[7]; | |
out_tile[27] += filts_strip[3]*in_strip[7]; | |
out_tile[28] += filts_strip[4]*in_strip[7]; | |
out_tile[29] += filts_strip[5]*in_strip[7]; | |
out_tile[30] += filts_strip[6]*in_strip[7]; | |
out_tile[31] += filts_strip[7]*in_strip[7]; | |
out_tile[32] += filts_strip[0]*in_strip[8]; | |
out_tile[33] += filts_strip[1]*in_strip[8]; | |
out_tile[34] += filts_strip[2]*in_strip[8]; | |
out_tile[35] += filts_strip[3]*in_strip[8]; | |
out_tile[36] += filts_strip[4]*in_strip[8]; | |
out_tile[37] += filts_strip[5]*in_strip[8]; | |
out_tile[38] += filts_strip[6]*in_strip[8]; | |
out_tile[39] += filts_strip[7]*in_strip[8]; | |
out_tile[40] += filts_strip[0]*in_strip[9]; | |
out_tile[41] += filts_strip[1]*in_strip[9]; | |
out_tile[42] += filts_strip[2]*in_strip[9]; | |
out_tile[43] += filts_strip[3]*in_strip[9]; | |
out_tile[44] += filts_strip[4]*in_strip[9]; | |
out_tile[45] += filts_strip[5]*in_strip[9]; | |
out_tile[46] += filts_strip[6]*in_strip[9]; | |
out_tile[47] += filts_strip[7]*in_strip[9]; | |
out_tile[48] += filts_strip[0]*in_strip[10]; | |
out_tile[49] += filts_strip[1]*in_strip[10]; | |
out_tile[50] += filts_strip[2]*in_strip[10]; | |
out_tile[51] += filts_strip[3]*in_strip[10]; | |
out_tile[52] += filts_strip[4]*in_strip[10]; | |
out_tile[53] += filts_strip[5]*in_strip[10]; | |
out_tile[54] += filts_strip[6]*in_strip[10]; | |
out_tile[55] += filts_strip[7]*in_strip[10]; | |
out_tile[56] += filts_strip[0]*in_strip[11]; | |
out_tile[57] += filts_strip[1]*in_strip[11]; | |
out_tile[58] += filts_strip[2]*in_strip[11]; | |
out_tile[59] += filts_strip[3]*in_strip[11]; | |
out_tile[60] += filts_strip[4]*in_strip[11]; | |
out_tile[61] += filts_strip[5]*in_strip[11]; | |
out_tile[62] += filts_strip[6]*in_strip[11]; | |
out_tile[63] += filts_strip[7]*in_strip[11]; | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; } | |
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz); | |
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz); | |
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ; | |
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
// Copies the input into a blocked, padded layout: one work-item produces one element of out.
// The flat output index decomposes (outermost first, per the substitution table that follows this
// kernel) as blk_bline:blk_bx:blk_in_chan:blk_y:blk_x with dims 18:4:16:40:12.
// Elements whose source coordinate falls outside the 28x28 image, or past the last image (20),
// are written as 0.0f.
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_5__in_pad_2__in_chans_16__ysz_28__xsz_28__tix_pels_tile_sz_32__t_tile_sz_8__bix_pels_blk_sz_72( GASQ float const * const in, GASQ float * const out ) {
  int32_t const oix = GLOB_ID_1D;
  if( oix >= 552960 ) { return; } // one work-item per output element; extras do nothing

  // split the flat output index into its block-relative coordinates
  int32_t const blk_x = oix % 12;
  int32_t const blk_y = (oix / 12) % 40;
  int32_t const blk_chan = (oix / 480) % 16;
  int32_t const blk_bx = (oix / 7680) % 4;
  int32_t const blk_bline = oix / 30720;

  // first output line covered by this block-line (32 output lines per block-line)
  int32_t const first_out_line = blk_bline * 32;
  // input lines spanned by one image: (28 - 1)*1 + 5 (presumably (ysz-1)*stride + kern_sz)
  int32_t const lines_per_img = (28 - 1)*1 + 5;
  // line offset of this element relative to the image that first_out_line falls in
  int32_t const src_line = blk_y + (first_out_line % 28)*1;
  int32_t const src_img = (first_out_line / 28) + (src_line / lines_per_img);
  // source pixel coordinates, shifted by 2 (the in_pad value in the substitution table)
  int32_t const src_y = (src_line % lines_per_img) - 2;
  int32_t const src_x = blk_bx*8*1 + blk_x - 2;

  // anything outside the image (or past the last image) is zero-filled
  if( ( src_x < 0 ) || ( src_y < 0 ) || ( src_x >= 28 ) || ( src_y >= 28 ) || ( src_img >= 20 ) ) {
    out[oix] = 0.0f;
    return;
  }
  out[oix] = in[ src_img*12544 + blk_chan*784 + src_y*28 + src_x*1 ];
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* stride = 1 */ | |
/* kern_sz = 5 */ | |
/* in_pad = 2 */ | |
/* in_chans = 16 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* tix_pels_tile_sz = 32 */ | |
/* t_tile_sz = 8 */ | |
/* bix_pels_blk_sz = 72 */ | |
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_5__in_pad_2__in_chans_16__ysz_28__xsz_28__tix_pels_tile_sz_32__t_tile_sz_8__bix_pels_blk_sz_72 */ | |
/* out_ix_blk_x_dim = 12 */ | |
/* out_ix_blk_x_sz = 1 */ | |
/* out_ix_blk_x_nomod = out_ix */ | |
/* out_ix_blk_x = (out_ix%%12) */ | |
/* out_ix_blk_y_dim = 40 */ | |
/* out_ix_blk_y_sz = 12 */ | |
/* out_ix_blk_y_nomod = (out_ix/12) */ | |
/* out_ix_blk_y = ((out_ix/12)%%40) */ | |
/* out_ix_blk_in_chan_dim = 16 */ | |
/* out_ix_blk_in_chan_sz = 480 */ | |
/* out_ix_blk_in_chan_nomod = (out_ix/480) */ | |
/* out_ix_blk_in_chan = ((out_ix/480)%%16) */ | |
/* out_ix_blk_bx_dim = 4 */ | |
/* out_ix_blk_bx_sz = 7680 */ | |
/* out_ix_blk_bx_nomod = (out_ix/7680) */ | |
/* out_ix_blk_bx = ((out_ix/7680)%%4) */ | |
/* out_ix_blk_bline_dim = 18 */ | |
/* out_ix_blk_bline_sz = 30720 */ | |
/* out_ix_blk_bline_nomod = (out_ix/30720) */ | |
/* out_ix_blk_bline = (out_ix/30720) */ | |
/* out_ix_sz = 552960 */ | |
/* out_line_y_dim = 28 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%28) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 28 */ | |
/* out_line_img_nomod = (out_line/28) */ | |
/* out_line_img = (out_line/28) */ | |
/* out_line_sz = 560 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 16 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%16) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 12544 */ | |
/* in_ix_img_nomod = (in_ix/12544) */ | |
/* in_ix_img = (in_ix/12544) */ | |
/* in_ix_sz = 250880 */ | |
// Reorders the filter weights: one work-item copies one element of in (indexed out_chan:in_chan:y:x)
// to its position in out (indexed out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile).
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_32__in_chans_16__kysz_5__kxsz_5( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const src_ix = GLOB_ID_1D;
  if( src_ix >= 12800 ) { return; } // 32*16*5*5 source elements; extra work-items do nothing

  // coordinates of this element in the source layout
  int32_t const kx = src_ix % 5;
  int32_t const ky = (src_ix / 5) % 5;
  int32_t const in_chan = (src_ix / 25) % 16;
  int32_t const out_chan = src_ix / 400;

  // split the output channel as blk:tile:reg (dims 1:4:8)
  int32_t const oc_reg = out_chan % 8;
  int32_t const oc_tile = (out_chan / 8) % 4;
  int32_t const oc_blk = out_chan / 32;

  // flat index in the destination layout (strides 12800:800:160:32:4:1)
  int32_t const dst_ix = oc_blk*12800 + in_chan*800 + ky*160 + kx*32 + oc_reg*4 + oc_tile*1;

  float val = 0.0f;
#if 1
  val = in[src_ix];
#else
  // debug pattern: encode (x,y) of the first input channel's taps instead of the real weights
  if( in_chan == 0 ) {
    val = kx*100 + ky;
  }
#endif
  out[dst_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 32 */ | |
/* in_chans = 16 */ | |
/* kysz = 5 */ | |
/* kxsz = 5 */ | |
/* rtc_func_name = xpose_filts__out_chans_32__in_chans_16__kysz_5__kxsz_5 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 5 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%5) */ | |
/* filts_ix_y_dim = 5 */ | |
/* filts_ix_y_sz = 5 */ | |
/* filts_ix_y_nomod = (filts_ix/5) */ | |
/* filts_ix_y = ((filts_ix/5)%%5) */ | |
/* filts_ix_in_chan_dim = 16 */ | |
/* filts_ix_in_chan_sz = 25 */ | |
/* filts_ix_in_chan_nomod = (filts_ix/25) */ | |
/* filts_ix_in_chan = ((filts_ix/25)%%16) */ | |
/* filts_ix_out_chan_dim = 32 */ | |
/* filts_ix_out_chan_sz = 400 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/400) */ | |
/* filts_ix_out_chan = (filts_ix/400) */ | |
/* filts_ix_sz = 12800 */ | |
/* filts_xp_ix_out_chan_tile_dim = 4 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 4 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */ | |
/* filts_xp_ix_x_dim = 5 */ | |
/* filts_xp_ix_x_sz = 32 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/32)%%5) */ | |
/* filts_xp_ix_y_dim = 5 */ | |
/* filts_xp_ix_y_sz = 160 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/160) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/160)%%5) */ | |
/* filts_xp_ix_in_chan_dim = 16 */ | |
/* filts_xp_ix_in_chan_sz = 800 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/800) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/800)%%16) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 12800 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12800) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12800) */ | |
/* filts_xp_ix_sz = 12800 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 4 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%4) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 32 */ | |
/* fioc_out_chan_blk_nomod = (fioc/32) */ | |
/* fioc_out_chan_blk = (fioc/32) */ | |
/* fioc_sz = 32 */ | |
// 3x3, stride-1 max pooling over 28x28 planes; one work-item computes one output element.
// in and out share the same img:chan:y:x layout (strides 150528:784:28:1).
//
// Window taps that fall outside the image are ignored, and the running maximum is seeded from the
// first in-bounds tap. The previous version seeded the maximum with 0.0f and treated out-of-image
// taps as 0, so any window whose real inputs were all negative produced 0 instead of its maximum.
// Results are unchanged whenever the window contains a non-negative value.
CUCL_GLOBAL_KERNEL void pool__num_imgs_20__in_pad_1__in_dim_0_28__in_dim_1_28__conv_has_relu_0__kern_sz_3__stride_1__out_chans_192__avg_pool_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 3010560 ) { return; } // one work-item per output element; extras do nothing

  int32_t const out_x = out_ix % 28;
  int32_t const out_y = (out_ix / 28) % 28;
  // offset of this (img, chan) plane in in; loop-invariant, so computed once
  int32_t const plane_base = (out_ix/150528)*150528 + ((out_ix/784)%192)*784;

  float out_v = 0.0f;  // stored as-is only if the window has no in-bounds tap
  int32_t have_v = 0;  // becomes 1 once out_v holds a real input value
  for( int32_t kx = 0; kx != 3; ++kx ) {
    for( int32_t ky = 0; ky != 3; ++ky ) {
      int32_t const in_ix_y = out_y*1 + ky - 1;
      int32_t const in_ix_x = out_x*1 + kx - 1;
      // skip taps that land in the padding region
      if( in_ix_y < 0 || in_ix_x < 0 || in_ix_x >= 28 || in_ix_y >= 28 ) { continue; }
      float const v = in[plane_base + in_ix_y*28 + in_ix_x*1];
      out_v = have_v ? max( out_v, v ) : v;
      have_v = 1;
    }
  }
  out[out_ix] = out_v;
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* in_pad = 1 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 0 */ | |
/* kern_sz = 3 */ | |
/* stride = 1 */ | |
/* out_chans = 192 */ | |
/* avg_pool = 0 */ | |
/* rtc_func_name = pool__num_imgs_20__in_pad_1__in_dim_0_28__in_dim_1_28__conv_has_relu_0__kern_sz_3__stride_1__out_chans_192__avg_pool_0 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 192 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%192) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 150528 */ | |
/* out_ix_img_nomod = (out_ix/150528) */ | |
/* out_ix_img = (out_ix/150528) */ | |
/* out_ix_sz = 3010560 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 192 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%192) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 150528 */ | |
/* in_ix_img_nomod = (in_ix/150528) */ | |
/* in_ix_img = (in_ix/150528) */ | |
/* in_ix_sz = 3010560 */ | |
/* op = out_v = max( out_v, v ) */ | |
/* op_post = */ | |
// 256 tbp | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out ) { | |
LOCSHAR_MEM float in_smem[32*8]; | |
int32_t const blk_filt_ix_sz = 4*8; | |
LOCSHAR_MEM float filts_smem[4*8]; // aka blk_filt_ix_sz, which wasn't const enough OpenCL | |
float out_tile[8*8] = {0}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // across patches (approx square block in x/y space, favoring x if sqrt() not integer) | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*6144; | |
int32_t const blk_patch_ix_sz = 32*8; | |
int32_t const blk_patch_ix_base = GRP_ID_1D*blk_patch_ix_sz; | |
// iteratate over filter elements | |
int32_t filts_off = blk_filt_ix_base; | |
for( int32_t filts_ix_out_chan_elem = 0; filts_ix_out_chan_elem != | |
(192 * 1 * 1); ++filts_ix_out_chan_elem ) { | |
BARRIER_SYNC; | |
if( LOC_ID_1D < blk_filt_ix_sz ) { | |
#ifdef NO_IOX // by default, we don't ever disable this, since it's seems about as good as it can be already | |
//filts_smem[LOC_ID_1D] = LOC_ID_1D; | |
filts_smem[LOC_ID_1D] = filts[LOC_ID_1D]; | |
#else | |
filts_smem[LOC_ID_1D] = filts[filts_off + LOC_ID_1D]; | |
#endif | |
} | |
for( int32_t i = 0; i != 2; ++i ) { | |
if( (LOC_ID_1D+LOC_SZ_1D*i) < blk_patch_ix_sz ) { | |
int32_t const t_smem_patch_ix = (blk_patch_ix_base+LOC_ID_1D+LOC_SZ_1D*i); | |
#ifdef NO_IO | |
//float v = LOC_ID_1D; | |
//float v = in[LOC_ID_1D]; | |
float v = in[filts_off + LOC_ID_1D]; | |
#else | |
float v = 0; | |
int const smem_in_ix_y = ((t_smem_patch_ix/28)%28)*1+(filts_ix_out_chan_elem%1) - 0; | |
int const smem_in_ix_x = (t_smem_patch_ix%28)*1+(filts_ix_out_chan_elem%1) - 0; | |
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 && | |
(t_smem_patch_ix/784) < 20 && | |
smem_in_ix_x < 28 && smem_in_ix_y < 28 ) { | |
v = in[(t_smem_patch_ix/784)*150528 + | |
filts_ix_out_chan_elem*784 + | |
smem_in_ix_y*28 + | |
smem_in_ix_x*1]; | |
}; | |
#endif | |
in_smem[LOC_ID_1D+LOC_SZ_1D*i] = v; | |
} | |
} | |
filts_off += 32; | |
BARRIER_SYNC; | |
#ifdef NO_IO | |
// begin t_tile_dummy_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D % 32) + 0]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D % 32) + 1]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D % 32) + 2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D % 32) + 3]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D % 32) + 4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D % 32) + 5]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D % 32) + 6]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D % 32) + 7]; | |
in_strip[0] = in_smem[(LOC_ID_1D % 32) + 0]; | |
in_strip[1] = in_smem[(LOC_ID_1D % 32) + 1]; | |
in_strip[2] = in_smem[(LOC_ID_1D % 32) + 2]; | |
in_strip[3] = in_smem[(LOC_ID_1D % 32) + 3]; | |
in_strip[4] = in_smem[(LOC_ID_1D % 32) + 4]; | |
in_strip[5] = in_smem[(LOC_ID_1D % 32) + 5]; | |
in_strip[6] = in_smem[(LOC_ID_1D % 32) + 6]; | |
in_strip[7] = in_smem[(LOC_ID_1D % 32) + 7]; | |
// end t_tile_dummy_loads; | |
#else | |
// begin t_tile_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4]; | |
in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0]; | |
in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1]; | |
in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2]; | |
in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3]; | |
in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4]; | |
in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5]; | |
in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6]; | |
in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7]; | |
// end t_tile_loads; | |
#endif | |
// (2) do 8^2 fmas into out_tile | |
// begin t_tile_fmas | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
// end t_tile_fmas; | |
} | |
// load per-block biases into smem | |
BARRIER_SYNC; | |
if( LOC_ID_1D < blk_filt_ix_sz ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*blk_filt_ix_sz; | |
int32_t const load_reg = LOC_ID_1D / 4; | |
int32_t const load_tile = LOC_ID_1D % 4; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 32 ) { filts_smem[LOC_ID_1D] = biases[ ocix ]; } | |
//int32_t const ocix_tile = (ocix / 8) % 4; | |
//int32_t const ocix_reg = ocix % 8; | |
//filts_smem[ocix_tile * 1 + ocix_reg * 4] = biases[ocix]; | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4]; | |
in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0]; | |
in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1]; | |
in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2]; | |
in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3]; | |
in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4]; | |
in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5]; | |
in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6]; | |
in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7]; | |
// end t_tile_loads; | |
// add bias to each elem of out_tile[] and store the results to out[] | |
#ifdef NO_IO | |
// begin t_tile_dummy_stores | |
out[0] = 0.0f | |
+ max(0.0f,out_tile[0] + filts_strip[0]) | |
+ max(0.0f,out_tile[1] + filts_strip[1]) | |
+ max(0.0f,out_tile[2] + filts_strip[2]) | |
+ max(0.0f,out_tile[3] + filts_strip[3]) | |
+ max(0.0f,out_tile[4] + filts_strip[4]) | |
+ max(0.0f,out_tile[5] + filts_strip[5]) | |
+ max(0.0f,out_tile[6] + filts_strip[6]) | |
+ max(0.0f,out_tile[7] + filts_strip[7]) | |
+ max(0.0f,out_tile[8] + filts_strip[0]) | |
+ max(0.0f,out_tile[9] + filts_strip[1]) | |
+ max(0.0f,out_tile[10] + filts_strip[2]) | |
+ max(0.0f,out_tile[11] + filts_strip[3]) | |
+ max(0.0f,out_tile[12] + filts_strip[4]) | |
+ max(0.0f,out_tile[13] + filts_strip[5]) | |
+ max(0.0f,out_tile[14] + filts_strip[6]) | |
+ max(0.0f,out_tile[15] + filts_strip[7]) | |
+ max(0.0f,out_tile[16] + filts_strip[0]) | |
+ max(0.0f,out_tile[17] + filts_strip[1]) | |
+ max(0.0f,out_tile[18] + filts_strip[2]) | |
+ max(0.0f,out_tile[19] + filts_strip[3]) | |
+ max(0.0f,out_tile[20] + filts_strip[4]) | |
+ max(0.0f,out_tile[21] + filts_strip[5]) | |
+ max(0.0f,out_tile[22] + filts_strip[6]) | |
+ max(0.0f,out_tile[23] + filts_strip[7]) | |
+ max(0.0f,out_tile[24] + filts_strip[0]) | |
+ max(0.0f,out_tile[25] + filts_strip[1]) | |
+ max(0.0f,out_tile[26] + filts_strip[2]) | |
+ max(0.0f,out_tile[27] + filts_strip[3]) | |
+ max(0.0f,out_tile[28] + filts_strip[4]) | |
+ max(0.0f,out_tile[29] + filts_strip[5]) | |
+ max(0.0f,out_tile[30] + filts_strip[6]) | |
+ max(0.0f,out_tile[31] + filts_strip[7]) | |
+ max(0.0f,out_tile[32] + filts_strip[0]) | |
+ max(0.0f,out_tile[33] + filts_strip[1]) | |
+ max(0.0f,out_tile[34] + filts_strip[2]) | |
+ max(0.0f,out_tile[35] + filts_strip[3]) | |
+ max(0.0f,out_tile[36] + filts_strip[4]) | |
+ max(0.0f,out_tile[37] + filts_strip[5]) | |
+ max(0.0f,out_tile[38] + filts_strip[6]) | |
+ max(0.0f,out_tile[39] + filts_strip[7]) | |
+ max(0.0f,out_tile[40] + filts_strip[0]) | |
+ max(0.0f,out_tile[41] + filts_strip[1]) | |
+ max(0.0f,out_tile[42] + filts_strip[2]) | |
+ max(0.0f,out_tile[43] + filts_strip[3]) | |
+ max(0.0f,out_tile[44] + filts_strip[4]) | |
+ max(0.0f,out_tile[45] + filts_strip[5]) | |
+ max(0.0f,out_tile[46] + filts_strip[6]) | |
+ max(0.0f,out_tile[47] + filts_strip[7]) | |
+ max(0.0f,out_tile[48] + filts_strip[0]) | |
+ max(0.0f,out_tile[49] + filts_strip[1]) | |
+ max(0.0f,out_tile[50] + filts_strip[2]) | |
+ max(0.0f,out_tile[51] + filts_strip[3]) | |
+ max(0.0f,out_tile[52] + filts_strip[4]) | |
+ max(0.0f,out_tile[53] + filts_strip[5]) | |
+ max(0.0f,out_tile[54] + filts_strip[6]) | |
+ max(0.0f,out_tile[55] + filts_strip[7]) | |
+ max(0.0f,out_tile[56] + filts_strip[0]) | |
+ max(0.0f,out_tile[57] + filts_strip[1]) | |
+ max(0.0f,out_tile[58] + filts_strip[2]) | |
+ max(0.0f,out_tile[59] + filts_strip[3]) | |
+ max(0.0f,out_tile[60] + filts_strip[4]) | |
+ max(0.0f,out_tile[61] + filts_strip[5]) | |
+ max(0.0f,out_tile[62] + filts_strip[6]) | |
+ max(0.0f,out_tile[63] + filts_strip[7]) | |
; | |
// end t_tile_dummy_stores; | |
#else | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) % 784 ); // cache out patch ixs | |
tpix[1] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) % 784 ); // cache out patch ixs | |
tpix[2] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) % 784 ); // cache out patch ixs | |
tpix[3] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) % 784 ); // cache out patch ixs | |
tpix[4] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) % 784 ); // cache out patch ixs | |
tpix[5] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) % 784 ); // cache out patch ixs | |
tpix[6] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) % 784 ); // cache out patch ixs | |
tpix[7] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) % 784 ); // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+0)*784; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+1)*784; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+2)*784; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+3)*784; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+4)*784; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+5)*784; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+6)*784; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+7)*784; // cache out chan ixs | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
#endif | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_pad = 0 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 1 */ | |
/* kern_sz = 1 */ | |
/* stride = 1 */ | |
/* out_chans = 32 */ | |
/* in_chans = 192 */ | |
/* rtc_func_name = conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_192 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 32 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%32) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 25088 */ | |
/* out_ix_img_nomod = (out_ix/25088) */ | |
/* out_ix_img = (out_ix/25088) */ | |
/* out_ix_sz = 501760 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 192 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%192) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 150528 */ | |
/* in_ix_img_nomod = (in_ix/150528) */ | |
/* in_ix_img = (in_ix/150528) */ | |
/* in_ix_sz = 3010560 */ | |
/* t_smem_patch_ix_x_dim = 28 */ | |
/* t_smem_patch_ix_x_sz = 1 */ | |
/* t_smem_patch_ix_x_nomod = t_smem_patch_ix */ | |
/* t_smem_patch_ix_x = (t_smem_patch_ix%%28) */ | |
/* t_smem_patch_ix_y_dim = 28 */ | |
/* t_smem_patch_ix_y_sz = 28 */ | |
/* t_smem_patch_ix_y_nomod = (t_smem_patch_ix/28) */ | |
/* t_smem_patch_ix_y = ((t_smem_patch_ix/28)%%28) */ | |
/* t_smem_patch_ix_img_dim = 20 */ | |
/* t_smem_patch_ix_img_sz = 784 */ | |
/* t_smem_patch_ix_img_nomod = (t_smem_patch_ix/784) */ | |
/* t_smem_patch_ix_img = (t_smem_patch_ix/784) */ | |
/* t_smem_patch_ix_sz = 15680 */ | |
/* filts_ix_out_chan_elem_x_dim = 1 */ | |
/* filts_ix_out_chan_elem_x_sz = 1 */ | |
/* filts_ix_out_chan_elem_x_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_x = (filts_ix_out_chan_elem%%1) */ | |
/* filts_ix_out_chan_elem_y_dim = 1 */ | |
/* filts_ix_out_chan_elem_y_sz = 1 */ | |
/* filts_ix_out_chan_elem_y_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_y = (filts_ix_out_chan_elem%%1) */ | |
/* filts_ix_out_chan_elem_in_chan_dim = 192 */ | |
/* filts_ix_out_chan_elem_in_chan_sz = 1 */ | |
/* filts_ix_out_chan_elem_in_chan_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_in_chan = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_sz = 192 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 4 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%4) */ | |
/* LOC_ID_1D_patch_tile_dim = 32 */ | |
/* LOC_ID_1D_patch_tile_sz = 4 */ | |
/* LOC_ID_1D_patch_tile_nomod = (LOC_ID_1D/4) */ | |
/* LOC_ID_1D_patch_tile = (LOC_ID_1D/4) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* filts_xp_ix_out_chan_tile_dim = 4 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 4 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 32 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/32)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 32 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/32)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 32 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/32)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 6144 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/6144) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/6144) */ | |
/* filts_xp_ix_sz = 6144 */ | |
/* patch_smem_load_iter = 2 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_patch_blk_dim = 62 */ | |
/* GRP_ID_1D_patch_blk_sz = 1 */ | |
/* GRP_ID_1D_patch_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_patch_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 62 */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* patch_tile = (%(LOC_ID_1D_patch_tile)+%(GRP_ID_1D_patch_blk)*%(LOC_ID_1D_patch_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* patch_ix_0 = (%(patch_tile)*%(t_tile_sz)+0) */ | |
/* patch_ix_0_x_dim = 28 */ | |
/* patch_ix_0_x_sz = 1 */ | |
/* patch_ix_0_x_nomod = %(patch_ix_0) */ | |
/* patch_ix_0_x = (%(patch_ix_0)%%28) */ | |
/* patch_ix_0_y_dim = 28 */ | |
/* patch_ix_0_y_sz = 28 */ | |
/* patch_ix_0_y_nomod = (%(patch_ix_0)/28) */ | |
/* patch_ix_0_y = ((%(patch_ix_0)/28)%%28) */ | |
/* patch_ix_0_img_dim = 20 */ | |
/* patch_ix_0_img_sz = 784 */ | |
/* patch_ix_0_img_nomod = (%(patch_ix_0)/784) */ | |
/* patch_ix_0_img = (%(patch_ix_0)/784) */ | |
/* patch_ix_0_sz = 15680 */ | |
/* patch_ix_1 = (%(patch_tile)*%(t_tile_sz)+1) */ | |
/* patch_ix_1_x_dim = 28 */ | |
/* patch_ix_1_x_sz = 1 */ | |
/* patch_ix_1_x_nomod = %(patch_ix_1) */ | |
/* patch_ix_1_x = (%(patch_ix_1)%%28) */ | |
/* patch_ix_1_y_dim = 28 */ | |
/* patch_ix_1_y_sz = 28 */ | |
/* patch_ix_1_y_nomod = (%(patch_ix_1)/28) */ | |
/* patch_ix_1_y = ((%(patch_ix_1)/28)%%28) */ | |
/* patch_ix_1_img_dim = 20 */ | |
/* patch_ix_1_img_sz = 784 */ | |
/* patch_ix_1_img_nomod = (%(patch_ix_1)/784) */ | |
/* patch_ix_1_img = (%(patch_ix_1)/784) */ | |
/* patch_ix_1_sz = 15680 */ | |
/* patch_ix_2 = (%(patch_tile)*%(t_tile_sz)+2) */ | |
/* patch_ix_2_x_dim = 28 */ | |
/* patch_ix_2_x_sz = 1 */ | |
/* patch_ix_2_x_nomod = %(patch_ix_2) */ | |
/* patch_ix_2_x = (%(patch_ix_2)%%28) */ | |
/* patch_ix_2_y_dim = 28 */ | |
/* patch_ix_2_y_sz = 28 */ | |
/* patch_ix_2_y_nomod = (%(patch_ix_2)/28) */ | |
/* patch_ix_2_y = ((%(patch_ix_2)/28)%%28) */ | |
/* patch_ix_2_img_dim = 20 */ | |
/* patch_ix_2_img_sz = 784 */ | |
/* patch_ix_2_img_nomod = (%(patch_ix_2)/784) */ | |
/* patch_ix_2_img = (%(patch_ix_2)/784) */ | |
/* patch_ix_2_sz = 15680 */ | |
/* patch_ix_3 = (%(patch_tile)*%(t_tile_sz)+3) */ | |
/* patch_ix_3_x_dim = 28 */ | |
/* patch_ix_3_x_sz = 1 */ | |
/* patch_ix_3_x_nomod = %(patch_ix_3) */ | |
/* patch_ix_3_x = (%(patch_ix_3)%%28) */ | |
/* patch_ix_3_y_dim = 28 */ | |
/* patch_ix_3_y_sz = 28 */ | |
/* patch_ix_3_y_nomod = (%(patch_ix_3)/28) */ | |
/* patch_ix_3_y = ((%(patch_ix_3)/28)%%28) */ | |
/* patch_ix_3_img_dim = 20 */ | |
/* patch_ix_3_img_sz = 784 */ | |
/* patch_ix_3_img_nomod = (%(patch_ix_3)/784) */ | |
/* patch_ix_3_img = (%(patch_ix_3)/784) */ | |
/* patch_ix_3_sz = 15680 */ | |
/* patch_ix_4 = (%(patch_tile)*%(t_tile_sz)+4) */ | |
/* patch_ix_4_x_dim = 28 */ | |
/* patch_ix_4_x_sz = 1 */ | |
/* patch_ix_4_x_nomod = %(patch_ix_4) */ | |
/* patch_ix_4_x = (%(patch_ix_4)%%28) */ | |
/* patch_ix_4_y_dim = 28 */ | |
/* patch_ix_4_y_sz = 28 */ | |
/* patch_ix_4_y_nomod = (%(patch_ix_4)/28) */ | |
/* patch_ix_4_y = ((%(patch_ix_4)/28)%%28) */ | |
/* patch_ix_4_img_dim = 20 */ | |
/* patch_ix_4_img_sz = 784 */ | |
/* patch_ix_4_img_nomod = (%(patch_ix_4)/784) */ | |
/* patch_ix_4_img = (%(patch_ix_4)/784) */ | |
/* patch_ix_4_sz = 15680 */ | |
/* patch_ix_5 = (%(patch_tile)*%(t_tile_sz)+5) */ | |
/* patch_ix_5_x_dim = 28 */ | |
/* patch_ix_5_x_sz = 1 */ | |
/* patch_ix_5_x_nomod = %(patch_ix_5) */ | |
/* patch_ix_5_x = (%(patch_ix_5)%%28) */ | |
/* patch_ix_5_y_dim = 28 */ | |
/* patch_ix_5_y_sz = 28 */ | |
/* patch_ix_5_y_nomod = (%(patch_ix_5)/28) */ | |
/* patch_ix_5_y = ((%(patch_ix_5)/28)%%28) */ | |
/* patch_ix_5_img_dim = 20 */ | |
/* patch_ix_5_img_sz = 784 */ | |
/* patch_ix_5_img_nomod = (%(patch_ix_5)/784) */ | |
/* patch_ix_5_img = (%(patch_ix_5)/784) */ | |
/* patch_ix_5_sz = 15680 */ | |
/* patch_ix_6 = (%(patch_tile)*%(t_tile_sz)+6) */ | |
/* patch_ix_6_x_dim = 28 */ | |
/* patch_ix_6_x_sz = 1 */ | |
/* patch_ix_6_x_nomod = %(patch_ix_6) */ | |
/* patch_ix_6_x = (%(patch_ix_6)%%28) */ | |
/* patch_ix_6_y_dim = 28 */ | |
/* patch_ix_6_y_sz = 28 */ | |
/* patch_ix_6_y_nomod = (%(patch_ix_6)/28) */ | |
/* patch_ix_6_y = ((%(patch_ix_6)/28)%%28) */ | |
/* patch_ix_6_img_dim = 20 */ | |
/* patch_ix_6_img_sz = 784 */ | |
/* patch_ix_6_img_nomod = (%(patch_ix_6)/784) */ | |
/* patch_ix_6_img = (%(patch_ix_6)/784) */ | |
/* patch_ix_6_sz = 15680 */ | |
/* patch_ix_7 = (%(patch_tile)*%(t_tile_sz)+7) */ | |
/* patch_ix_7_x_dim = 28 */ | |
/* patch_ix_7_x_sz = 1 */ | |
/* patch_ix_7_x_nomod = %(patch_ix_7) */ | |
/* patch_ix_7_x = (%(patch_ix_7)%%28) */ | |
/* patch_ix_7_y_dim = 28 */ | |
/* patch_ix_7_y_sz = 28 */ | |
/* patch_ix_7_y_nomod = (%(patch_ix_7)/28) */ | |
/* patch_ix_7_y = ((%(patch_ix_7)/28)%%28) */ | |
/* patch_ix_7_img_dim = 20 */ | |
/* patch_ix_7_img_sz = 784 */ | |
/* patch_ix_7_img_nomod = (%(patch_ix_7)/784) */ | |
/* patch_ix_7_img = (%(patch_ix_7)/784) */ | |
/* patch_ix_7_sz = 15680 */ | |
/* get_in = float v = 0; | |
int const smem_in_ix_y = %(t_smem_patch_ix_y)*%(stride)+%(filts_ix_out_chan_elem_y) - %(in_pad); | |
int const smem_in_ix_x = %(t_smem_patch_ix_x)*%(stride)+%(filts_ix_out_chan_elem_x) - %(in_pad); | |
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 && | |
%(t_smem_patch_ix_img) < %(in_ix_img_dim) && | |
smem_in_ix_x < %(in_ix_x_dim) && smem_in_ix_y < %(in_ix_y_dim) ) { | |
v = in[%(t_smem_patch_ix_img)*%(in_ix_img_sz) + | |
%(filts_ix_out_chan_elem_in_chan)*%(in_ix_chan_sz) + | |
smem_in_ix_y*%(in_ix_y_sz) + | |
smem_in_ix_x*%(in_ix_x_sz)]; | |
} */ | |
/* t_tile_fmas = // begin t_tile_fmas | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
// end t_tile_fmas */ | |
/* t_tile_loads = // begin t_tile_loads | |
filts_strip[0] = filts_smem[%(LOC_ID_1D_out_chan_tile)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem[%(LOC_ID_1D_out_chan_tile)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem[%(LOC_ID_1D_out_chan_tile)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem[%(LOC_ID_1D_out_chan_tile)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem[%(LOC_ID_1D_out_chan_tile)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem[%(LOC_ID_1D_out_chan_tile)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem[%(LOC_ID_1D_out_chan_tile)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem[%(LOC_ID_1D_out_chan_tile)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+0]; | |
in_strip[1] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+1]; | |
in_strip[2] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+2]; | |
in_strip[3] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+3]; | |
in_strip[4] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+4]; | |
in_strip[5] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+5]; | |
in_strip[6] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+6]; | |
in_strip[7] = in_smem[%(t_tile_sz)*%(LOC_ID_1D_patch_tile)+7]; | |
// end t_tile_loads */ | |
/* t_tile_dummy_loads = // begin t_tile_dummy_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D %% 32) + 0]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D %% 32) + 1]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D %% 32) + 2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D %% 32) + 3]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D %% 32) + 4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D %% 32) + 5]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D %% 32) + 6]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D %% 32) + 7]; | |
in_strip[0] = in_smem[(LOC_ID_1D %% 32) + 0]; | |
in_strip[1] = in_smem[(LOC_ID_1D %% 32) + 1]; | |
in_strip[2] = in_smem[(LOC_ID_1D %% 32) + 2]; | |
in_strip[3] = in_smem[(LOC_ID_1D %% 32) + 3]; | |
in_strip[4] = in_smem[(LOC_ID_1D %% 32) + 4]; | |
in_strip[5] = in_smem[(LOC_ID_1D %% 32) + 5]; | |
in_strip[6] = in_smem[(LOC_ID_1D %% 32) + 6]; | |
in_strip[7] = in_smem[(LOC_ID_1D %% 32) + 7]; | |
// end t_tile_dummy_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
int32_t tpix[%(t_tile_sz)]; | |
int32_t tcix[%(t_tile_sz)]; | |
tpix[0] = %(patch_ix_0_img)*%(out_ix_img_sz) + | |
( %(patch_ix_0) %% %(patch_ix_0_img_sz) ); // cache out patch ixs | |
tpix[1] = %(patch_ix_1_img)*%(out_ix_img_sz) + | |
( %(patch_ix_1) %% %(patch_ix_1_img_sz) ); // cache out patch ixs | |
tpix[2] = %(patch_ix_2_img)*%(out_ix_img_sz) + | |
( %(patch_ix_2) %% %(patch_ix_2_img_sz) ); // cache out patch ixs | |
tpix[3] = %(patch_ix_3_img)*%(out_ix_img_sz) + | |
( %(patch_ix_3) %% %(patch_ix_3_img_sz) ); // cache out patch ixs | |
tpix[4] = %(patch_ix_4_img)*%(out_ix_img_sz) + | |
( %(patch_ix_4) %% %(patch_ix_4_img_sz) ); // cache out patch ixs | |
tpix[5] = %(patch_ix_5_img)*%(out_ix_img_sz) + | |
( %(patch_ix_5) %% %(patch_ix_5_img_sz) ); // cache out patch ixs | |
tpix[6] = %(patch_ix_6_img)*%(out_ix_img_sz) + | |
( %(patch_ix_6) %% %(patch_ix_6_img_sz) ); // cache out patch ixs | |
tpix[7] = %(patch_ix_7_img)*%(out_ix_img_sz) + | |
( %(patch_ix_7) %% %(patch_ix_7_img_sz) ); // cache out patch ixs | |
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs | |
if( %(patch_ix_0) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( %(patch_ix_1) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( %(patch_ix_2) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( %(patch_ix_3) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( %(patch_ix_4) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( %(patch_ix_5) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( %(patch_ix_6) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( %(patch_ix_7) >= %(patch_ix_0_sz) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
/* t_tile_dummy_stores = // begin t_tile_dummy_stores | |
out[0] = 0.0f | |
+ max(0.0f,out_tile[0] + filts_strip[0]) | |
+ max(0.0f,out_tile[1] + filts_strip[1]) | |
+ max(0.0f,out_tile[2] + filts_strip[2]) | |
+ max(0.0f,out_tile[3] + filts_strip[3]) | |
+ max(0.0f,out_tile[4] + filts_strip[4]) | |
+ max(0.0f,out_tile[5] + filts_strip[5]) | |
+ max(0.0f,out_tile[6] + filts_strip[6]) | |
+ max(0.0f,out_tile[7] + filts_strip[7]) | |
+ max(0.0f,out_tile[8] + filts_strip[0]) | |
+ max(0.0f,out_tile[9] + filts_strip[1]) | |
+ max(0.0f,out_tile[10] + filts_strip[2]) | |
+ max(0.0f,out_tile[11] + filts_strip[3]) | |
+ max(0.0f,out_tile[12] + filts_strip[4]) | |
+ max(0.0f,out_tile[13] + filts_strip[5]) | |
+ max(0.0f,out_tile[14] + filts_strip[6]) | |
+ max(0.0f,out_tile[15] + filts_strip[7]) | |
+ max(0.0f,out_tile[16] + filts_strip[0]) | |
+ max(0.0f,out_tile[17] + filts_strip[1]) | |
+ max(0.0f,out_tile[18] + filts_strip[2]) | |
+ max(0.0f,out_tile[19] + filts_strip[3]) | |
+ max(0.0f,out_tile[20] + filts_strip[4]) | |
+ max(0.0f,out_tile[21] + filts_strip[5]) | |
+ max(0.0f,out_tile[22] + filts_strip[6]) | |
+ max(0.0f,out_tile[23] + filts_strip[7]) | |
+ max(0.0f,out_tile[24] + filts_strip[0]) | |
+ max(0.0f,out_tile[25] + filts_strip[1]) | |
+ max(0.0f,out_tile[26] + filts_strip[2]) | |
+ max(0.0f,out_tile[27] + filts_strip[3]) | |
+ max(0.0f,out_tile[28] + filts_strip[4]) | |
+ max(0.0f,out_tile[29] + filts_strip[5]) | |
+ max(0.0f,out_tile[30] + filts_strip[6]) | |
+ max(0.0f,out_tile[31] + filts_strip[7]) | |
+ max(0.0f,out_tile[32] + filts_strip[0]) | |
+ max(0.0f,out_tile[33] + filts_strip[1]) | |
+ max(0.0f,out_tile[34] + filts_strip[2]) | |
+ max(0.0f,out_tile[35] + filts_strip[3]) | |
+ max(0.0f,out_tile[36] + filts_strip[4]) | |
+ max(0.0f,out_tile[37] + filts_strip[5]) | |
+ max(0.0f,out_tile[38] + filts_strip[6]) | |
+ max(0.0f,out_tile[39] + filts_strip[7]) | |
+ max(0.0f,out_tile[40] + filts_strip[0]) | |
+ max(0.0f,out_tile[41] + filts_strip[1]) | |
+ max(0.0f,out_tile[42] + filts_strip[2]) | |
+ max(0.0f,out_tile[43] + filts_strip[3]) | |
+ max(0.0f,out_tile[44] + filts_strip[4]) | |
+ max(0.0f,out_tile[45] + filts_strip[5]) | |
+ max(0.0f,out_tile[46] + filts_strip[6]) | |
+ max(0.0f,out_tile[47] + filts_strip[7]) | |
+ max(0.0f,out_tile[48] + filts_strip[0]) | |
+ max(0.0f,out_tile[49] + filts_strip[1]) | |
+ max(0.0f,out_tile[50] + filts_strip[2]) | |
+ max(0.0f,out_tile[51] + filts_strip[3]) | |
+ max(0.0f,out_tile[52] + filts_strip[4]) | |
+ max(0.0f,out_tile[53] + filts_strip[5]) | |
+ max(0.0f,out_tile[54] + filts_strip[6]) | |
+ max(0.0f,out_tile[55] + filts_strip[7]) | |
+ max(0.0f,out_tile[56] + filts_strip[0]) | |
+ max(0.0f,out_tile[57] + filts_strip[1]) | |
+ max(0.0f,out_tile[58] + filts_strip[2]) | |
+ max(0.0f,out_tile[59] + filts_strip[3]) | |
+ max(0.0f,out_tile[60] + filts_strip[4]) | |
+ max(0.0f,out_tile[61] + filts_strip[5]) | |
+ max(0.0f,out_tile[62] + filts_strip[6]) | |
+ max(0.0f,out_tile[63] + filts_strip[7]) | |
; | |
// end t_tile_dummy_stores */ | |
// Filter re-layout kernel: each work-item copies exactly one filter weight from the
// out_chan:in_chan:y:x input layout to its slot in the
// out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile output layout.
// All sizes are baked in for this instance (32 out chans, 192 in chans, 1x1 kernel,
// 6144 elements total); work-items with an id of 6144 or more do nothing.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_32__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix < 6144 ) {
    // split the flat source index into its out_chan / in_chan / kernel-offset parts
    int32_t const fioc = filts_ix / 192;      // source output channel
    int32_t const src_in_chan = filts_ix % 192;
    int32_t const src_k_off_a = filts_ix % 1; // kernel offset; dim is 1, so always 0
    int32_t const src_k_off_b = filts_ix % 1; // kernel offset; dim is 1, so always 0

    // split the output channel into the blk / tile / reg parts of the destination layout
    int32_t const oc_blk = fioc / 32;
    int32_t const oc_tile = ( fioc / 8 ) % 4;
    int32_t const oc_reg = fioc % 8;

    // reassemble the flat destination index, one term per destination dim (stride on the right)
    int32_t const filts_xp_ix =
      oc_blk * 6144 +
      src_in_chan * 32 +
      src_k_off_a * 32 +
      src_k_off_b * 32 +
      oc_reg * 4 +
      oc_tile * 1;

#if 1
    float const val = in[filts_ix];
#else
    // disabled debug pattern: elements with in_chan 0 get (offset*100 + offset), all others 0.
    // (an extra, commented-out filter restricted this to both kernel offsets being 5.)
    float const val = ( src_in_chan == 0 ) ? (float)( src_k_off_a * 100 + src_k_off_b ) : 0.0f;
#endif
    out[filts_xp_ix] = val;
  }
}
// -- template substitution table used: --
/* out_chans = 32 */ | |
/* in_chans = 192 */ | |
/* kysz = 1 */ | |
/* kxsz = 1 */ | |
/* rtc_func_name = xpose_filts__out_chans_32__in_chans_192__kysz_1__kxsz_1 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 1 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%1) */ | |
/* filts_ix_y_dim = 1 */ | |
/* filts_ix_y_sz = 1 */ | |
/* filts_ix_y_nomod = filts_ix */ | |
/* filts_ix_y = (filts_ix%%1) */ | |
/* filts_ix_in_chan_dim = 192 */ | |
/* filts_ix_in_chan_sz = 1 */ | |
/* filts_ix_in_chan_nomod = filts_ix */ | |
/* filts_ix_in_chan = (filts_ix%%192) */ | |
/* filts_ix_out_chan_dim = 32 */ | |
/* filts_ix_out_chan_sz = 192 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/192) */ | |
/* filts_ix_out_chan = (filts_ix/192) */ | |
/* filts_ix_sz = 6144 */ | |
/* filts_xp_ix_out_chan_tile_dim = 4 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 4 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 32 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/32)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 32 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/32)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 32 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/32)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 6144 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/6144) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/6144) */ | |
/* filts_xp_ix_sz = 6144 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 4 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%4) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 32 */ | |
/* fioc_out_chan_blk_nomod = (fioc/32) */ | |
/* fioc_out_chan_blk = (fioc/32) */ | |
/* fioc_sz = 32 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_192( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
//int32_t const blk_in_ix_sz = 16*8; | |
LOCSHAR_MEM float all_smem[1536]; // note: max(filts+in,out) == max(512+1024,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 512; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*12288; // index of first out chan | |
int32_t blk_in_ix_base = GRP_ID_1D*24576 + LOC_ID_1D;// index of first input pel to load for this thread | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%8); | |
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/8); | |
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D; | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
// iteratate over filter elements | |
for( int32_t blk_iter = 0; blk_iter != 24; ++blk_iter ) { | |
BARRIER_SYNC; | |
// begin smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)]; | |
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)]; | |
filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)]; | |
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ]; | |
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ]; | |
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ]; | |
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ]; | |
in_smem[(LOC_ID_1D + 128 * 4)] = in[ blk_in_ix_base + (128*4) ]; | |
in_smem[(LOC_ID_1D + 128 * 5)] = in[ blk_in_ix_base + (128*5) ]; | |
in_smem[(LOC_ID_1D + 128 * 6)] = in[ blk_in_ix_base + (128*6) ]; | |
in_smem[(LOC_ID_1D + 128 * 7)] = in[ blk_in_ix_base + (128*7) ]; | |
// end smem_loads; | |
BARRIER_SYNC; | |
filts_off += 64*8; | |
blk_in_ix_base += 1024; | |
// begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*64+0*8]; | |
filts_strip[1] = filts_smem_off[0*64+1*8]; | |
filts_strip[2] = filts_smem_off[0*64+2*8]; | |
filts_strip[3] = filts_smem_off[0*64+3*8]; | |
filts_strip[4] = filts_smem_off[0*64+4*8]; | |
filts_strip[5] = filts_smem_off[0*64+5*8]; | |
filts_strip[6] = filts_smem_off[0*64+6*8]; | |
filts_strip[7] = filts_smem_off[0*64+7*8]; | |
in_strip[0] = in_smem_off[(0*8*16+0)]; | |
in_strip[1] = in_smem_off[(0*8*16+1)]; | |
in_strip[2] = in_smem_off[(0*8*16+2)]; | |
in_strip[3] = in_smem_off[(0*8*16+3)]; | |
in_strip[4] = in_smem_off[(0*8*16+4)]; | |
in_strip[5] = in_smem_off[(0*8*16+5)]; | |
in_strip[6] = in_smem_off[(0*8*16+6)]; | |
in_strip[7] = in_smem_off[(0*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*64+0*8]; | |
filts_strip[1] = filts_smem_off[1*64+1*8]; | |
filts_strip[2] = filts_smem_off[1*64+2*8]; | |
filts_strip[3] = filts_smem_off[1*64+3*8]; | |
filts_strip[4] = filts_smem_off[1*64+4*8]; | |
filts_strip[5] = filts_smem_off[1*64+5*8]; | |
filts_strip[6] = filts_smem_off[1*64+6*8]; | |
filts_strip[7] = filts_smem_off[1*64+7*8]; | |
in_strip[0] = in_smem_off[(1*8*16+0)]; | |
in_strip[1] = in_smem_off[(1*8*16+1)]; | |
in_strip[2] = in_smem_off[(1*8*16+2)]; | |
in_strip[3] = in_smem_off[(1*8*16+3)]; | |
in_strip[4] = in_smem_off[(1*8*16+4)]; | |
in_strip[5] = in_smem_off[(1*8*16+5)]; | |
in_strip[6] = in_smem_off[(1*8*16+6)]; | |
in_strip[7] = in_smem_off[(1*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*64+0*8]; | |
filts_strip[1] = filts_smem_off[2*64+1*8]; | |
filts_strip[2] = filts_smem_off[2*64+2*8]; | |
filts_strip[3] = filts_smem_off[2*64+3*8]; | |
filts_strip[4] = filts_smem_off[2*64+4*8]; | |
filts_strip[5] = filts_smem_off[2*64+5*8]; | |
filts_strip[6] = filts_smem_off[2*64+6*8]; | |
filts_strip[7] = filts_smem_off[2*64+7*8]; | |
in_strip[0] = in_smem_off[(2*8*16+0)]; | |
in_strip[1] = in_smem_off[(2*8*16+1)]; | |
in_strip[2] = in_smem_off[(2*8*16+2)]; | |
in_strip[3] = in_smem_off[(2*8*16+3)]; | |
in_strip[4] = in_smem_off[(2*8*16+4)]; | |
in_strip[5] = in_smem_off[(2*8*16+5)]; | |
in_strip[6] = in_smem_off[(2*8*16+6)]; | |
in_strip[7] = in_smem_off[(2*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*64+0*8]; | |
filts_strip[1] = filts_smem_off[3*64+1*8]; | |
filts_strip[2] = filts_smem_off[3*64+2*8]; | |
filts_strip[3] = filts_smem_off[3*64+3*8]; | |
filts_strip[4] = filts_smem_off[3*64+4*8]; | |
filts_strip[5] = filts_smem_off[3*64+5*8]; | |
filts_strip[6] = filts_smem_off[3*64+6*8]; | |
filts_strip[7] = filts_smem_off[3*64+7*8]; | |
in_strip[0] = in_smem_off[(3*8*16+0)]; | |
in_strip[1] = in_smem_off[(3*8*16+1)]; | |
in_strip[2] = in_smem_off[(3*8*16+2)]; | |
in_strip[3] = in_smem_off[(3*8*16+3)]; | |
in_strip[4] = in_smem_off[(3*8*16+4)]; | |
in_strip[5] = in_smem_off[(3*8*16+5)]; | |
in_strip[6] = in_smem_off[(3*8*16+6)]; | |
in_strip[7] = in_smem_off[(3*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*64+0*8]; | |
filts_strip[1] = filts_smem_off[4*64+1*8]; | |
filts_strip[2] = filts_smem_off[4*64+2*8]; | |
filts_strip[3] = filts_smem_off[4*64+3*8]; | |
filts_strip[4] = filts_smem_off[4*64+4*8]; | |
filts_strip[5] = filts_smem_off[4*64+5*8]; | |
filts_strip[6] = filts_smem_off[4*64+6*8]; | |
filts_strip[7] = filts_smem_off[4*64+7*8]; | |
in_strip[0] = in_smem_off[(4*8*16+0)]; | |
in_strip[1] = in_smem_off[(4*8*16+1)]; | |
in_strip[2] = in_smem_off[(4*8*16+2)]; | |
in_strip[3] = in_smem_off[(4*8*16+3)]; | |
in_strip[4] = in_smem_off[(4*8*16+4)]; | |
in_strip[5] = in_smem_off[(4*8*16+5)]; | |
in_strip[6] = in_smem_off[(4*8*16+6)]; | |
in_strip[7] = in_smem_off[(4*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*64+0*8]; | |
filts_strip[1] = filts_smem_off[5*64+1*8]; | |
filts_strip[2] = filts_smem_off[5*64+2*8]; | |
filts_strip[3] = filts_smem_off[5*64+3*8]; | |
filts_strip[4] = filts_smem_off[5*64+4*8]; | |
filts_strip[5] = filts_smem_off[5*64+5*8]; | |
filts_strip[6] = filts_smem_off[5*64+6*8]; | |
filts_strip[7] = filts_smem_off[5*64+7*8]; | |
in_strip[0] = in_smem_off[(5*8*16+0)]; | |
in_strip[1] = in_smem_off[(5*8*16+1)]; | |
in_strip[2] = in_smem_off[(5*8*16+2)]; | |
in_strip[3] = in_smem_off[(5*8*16+3)]; | |
in_strip[4] = in_smem_off[(5*8*16+4)]; | |
in_strip[5] = in_smem_off[(5*8*16+5)]; | |
in_strip[6] = in_smem_off[(5*8*16+6)]; | |
in_strip[7] = in_smem_off[(5*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*64+0*8]; | |
filts_strip[1] = filts_smem_off[6*64+1*8]; | |
filts_strip[2] = filts_smem_off[6*64+2*8]; | |
filts_strip[3] = filts_smem_off[6*64+3*8]; | |
filts_strip[4] = filts_smem_off[6*64+4*8]; | |
filts_strip[5] = filts_smem_off[6*64+5*8]; | |
filts_strip[6] = filts_smem_off[6*64+6*8]; | |
filts_strip[7] = filts_smem_off[6*64+7*8]; | |
in_strip[0] = in_smem_off[(6*8*16+0)]; | |
in_strip[1] = in_smem_off[(6*8*16+1)]; | |
in_strip[2] = in_smem_off[(6*8*16+2)]; | |
in_strip[3] = in_smem_off[(6*8*16+3)]; | |
in_strip[4] = in_smem_off[(6*8*16+4)]; | |
in_strip[5] = in_smem_off[(6*8*16+5)]; | |
in_strip[6] = in_smem_off[(6*8*16+6)]; | |
in_strip[7] = in_smem_off[(6*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*64+0*8]; | |
filts_strip[1] = filts_smem_off[7*64+1*8]; | |
filts_strip[2] = filts_smem_off[7*64+2*8]; | |
filts_strip[3] = filts_smem_off[7*64+3*8]; | |
filts_strip[4] = filts_smem_off[7*64+4*8]; | |
filts_strip[5] = filts_smem_off[7*64+5*8]; | |
filts_strip[6] = filts_smem_off[7*64+6*8]; | |
filts_strip[7] = filts_smem_off[7*64+7*8]; | |
in_strip[0] = in_smem_off[(7*8*16+0)]; | |
in_strip[1] = in_smem_off[(7*8*16+1)]; | |
in_strip[2] = in_smem_off[(7*8*16+2)]; | |
in_strip[3] = in_smem_off[(7*8*16+3)]; | |
in_strip[4] = in_smem_off[(7*8*16+4)]; | |
in_strip[5] = in_smem_off[(7*8*16+5)]; | |
in_strip[6] = in_smem_off[(7*8*16+6)]; | |
in_strip[7] = in_smem_off[(7*8*16+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
; | |
} | |
// load per-block biases into smem | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 64 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*64; | |
int32_t const load_reg = t_smem_bias_ix / 8; | |
int32_t const load_tile = t_smem_bias_ix % 8; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 64 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*8]; | |
filts_strip[1] = filts_smem_off[1*8]; | |
filts_strip[2] = filts_smem_off[2*8]; | |
filts_strip[3] = filts_smem_off[3*8]; | |
filts_strip[4] = filts_smem_off[4*8]; | |
filts_strip[5] = filts_smem_off[5*8]; | |
filts_strip[6] = filts_smem_off[6*8]; | |
filts_strip[7] = filts_smem_off[7*8]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { | |
GASQ float * const out_off = out + LOC_ID_1D; | |
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]); | |
; | |
return; | |
} | |
// add bias to each elem of out_tile[] and store the results to out[] | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)%784)*1 ; // cache out patch ixs | |
tpix[1] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)%784)*1 ; // cache out patch ixs | |
tpix[2] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)%784)*1 ; // cache out patch ixs | |
tpix[3] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)%784)*1 ; // cache out patch ixs | |
tpix[4] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)%784)*1 ; // cache out patch ixs | |
tpix[5] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)%784)*1 ; // cache out patch ixs | |
tpix[6] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)%784)*1 ; // cache out patch ixs | |
tpix[7] = ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/784)*50176 + ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)%784)*1 ; // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+0)*784; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+1)*784; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+2)*784; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+3)*784; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+4)*784; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+5)*784; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+6)*784; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%8)+(GRP_ID_1D%1)*8)*8)+7)*784; // cache out chan ixs | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+0)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+1)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+2)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+3)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+4)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+5)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+6)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( ((GRP_ID_1D*128 + (LOC_ID_1D/8)*8+7)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (64*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (64*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (64*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (64*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (64*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (64*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (64*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (64*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 64 */ | |
/* write_xposed = 0 */ | |
/* in_chans = 192 */ | |
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_64__write_xposed_0__in_chans_192 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 64 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%64) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 50176 */ | |
/* out_ix_img_nomod = (out_ix/50176) */ | |
/* out_ix_img = (out_ix/50176) */ | |
/* out_ix_sz = 1003520 */ | |
/* tpb = 128 */ | |
/* in_chan_tile = 8 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 8 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%8) */ | |
/* LOC_ID_1D_pels_tile_dim = 16 */ | |
/* LOC_ID_1D_pels_tile_sz = 8 */ | |
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/8) */ | |
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/8) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_pels_blk_dim = 123 */ | |
/* GRP_ID_1D_pels_blk_sz = 1 */ | |
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_pels_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 123 */ | |
/* in_ix_blk_pel_dim = 128 */ | |
/* in_ix_blk_pel_sz = 1 */ | |
/* in_ix_blk_pel_nomod = in_ix */ | |
/* in_ix_blk_pel = (in_ix%%128) */ | |
/* in_ix_blk_iter_chan_dim = 8 */ | |
/* in_ix_blk_iter_chan_sz = 128 */ | |
/* in_ix_blk_iter_chan_nomod = (in_ix/128) */ | |
/* in_ix_blk_iter_chan = ((in_ix/128)%%8) */ | |
/* in_ix_blk_iter_dim = 24 */ | |
/* in_ix_blk_iter_sz = 1024 */ | |
/* in_ix_blk_iter_nomod = (in_ix/1024) */ | |
/* in_ix_blk_iter = ((in_ix/1024)%%24) */ | |
/* in_ix_blk_dim = 123 */ | |
/* in_ix_blk_sz = 24576 */ | |
/* in_ix_blk_nomod = (in_ix/24576) */ | |
/* in_ix_blk = (in_ix/24576) */ | |
/* in_ix_sz = 3022848 */ | |
/* blk_filt_ix_sz = 64 */ | |
/* filts_smem_sz = 512 */ | |
/* in_smem_sz = 1024 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1536 */ | |
/* filts_xp_ix_out_chan_tile_dim = 8 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 8 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 64 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 12288 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12288) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12288) */ | |
/* filts_xp_ix_sz = 12288 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* smem_loads = // begin smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)]; | |
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 4)] = in[ blk_in_ix_base + (%(tpb)*4) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 5)] = in[ blk_in_ix_base + (%(tpb)*5) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 6)] = in[ blk_in_ix_base + (%(tpb)*6) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 7)] = in[ blk_in_ix_base + (%(tpb)*7) ]; | |
// end smem_loads */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* t_smem_ld_pel_pel_dim = 128 */ | |
/* t_smem_ld_pel_pel_sz = 1 */ | |
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */ | |
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%128) */ | |
/* t_smem_ld_pel_chan_dim = 8 */ | |
/* t_smem_ld_pel_chan_sz = 128 */ | |
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/128) */ | |
/* t_smem_ld_pel_chan = (t_smem_ld_pel/128) */ | |
/* t_smem_ld_pel_sz = 1024 */ | |
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */ | |
/* out_pel_0_pel_dim = 784 */ | |
/* out_pel_0_pel_sz = 1 */ | |
/* out_pel_0_pel_nomod = %(out_pel_0) */ | |
/* out_pel_0_pel = (%(out_pel_0)%%784) */ | |
/* out_pel_0_img_dim = 20 */ | |
/* out_pel_0_img_sz = 784 */ | |
/* out_pel_0_img_nomod = (%(out_pel_0)/784) */ | |
/* out_pel_0_img = (%(out_pel_0)/784) */ | |
/* out_pel_0_sz = 15680 */ | |
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */ | |
/* out_pel_1_pel_dim = 784 */ | |
/* out_pel_1_pel_sz = 1 */ | |
/* out_pel_1_pel_nomod = %(out_pel_1) */ | |
/* out_pel_1_pel = (%(out_pel_1)%%784) */ | |
/* out_pel_1_img_dim = 20 */ | |
/* out_pel_1_img_sz = 784 */ | |
/* out_pel_1_img_nomod = (%(out_pel_1)/784) */ | |
/* out_pel_1_img = (%(out_pel_1)/784) */ | |
/* out_pel_1_sz = 15680 */ | |
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */ | |
/* out_pel_2_pel_dim = 784 */ | |
/* out_pel_2_pel_sz = 1 */ | |
/* out_pel_2_pel_nomod = %(out_pel_2) */ | |
/* out_pel_2_pel = (%(out_pel_2)%%784) */ | |
/* out_pel_2_img_dim = 20 */ | |
/* out_pel_2_img_sz = 784 */ | |
/* out_pel_2_img_nomod = (%(out_pel_2)/784) */ | |
/* out_pel_2_img = (%(out_pel_2)/784) */ | |
/* out_pel_2_sz = 15680 */ | |
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */ | |
/* out_pel_3_pel_dim = 784 */ | |
/* out_pel_3_pel_sz = 1 */ | |
/* out_pel_3_pel_nomod = %(out_pel_3) */ | |
/* out_pel_3_pel = (%(out_pel_3)%%784) */ | |
/* out_pel_3_img_dim = 20 */ | |
/* out_pel_3_img_sz = 784 */ | |
/* out_pel_3_img_nomod = (%(out_pel_3)/784) */ | |
/* out_pel_3_img = (%(out_pel_3)/784) */ | |
/* out_pel_3_sz = 15680 */ | |
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */ | |
/* out_pel_4_pel_dim = 784 */ | |
/* out_pel_4_pel_sz = 1 */ | |
/* out_pel_4_pel_nomod = %(out_pel_4) */ | |
/* out_pel_4_pel = (%(out_pel_4)%%784) */ | |
/* out_pel_4_img_dim = 20 */ | |
/* out_pel_4_img_sz = 784 */ | |
/* out_pel_4_img_nomod = (%(out_pel_4)/784) */ | |
/* out_pel_4_img = (%(out_pel_4)/784) */ | |
/* out_pel_4_sz = 15680 */ | |
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */ | |
/* out_pel_5_pel_dim = 784 */ | |
/* out_pel_5_pel_sz = 1 */ | |
/* out_pel_5_pel_nomod = %(out_pel_5) */ | |
/* out_pel_5_pel = (%(out_pel_5)%%784) */ | |
/* out_pel_5_img_dim = 20 */ | |
/* out_pel_5_img_sz = 784 */ | |
/* out_pel_5_img_nomod = (%(out_pel_5)/784) */ | |
/* out_pel_5_img = (%(out_pel_5)/784) */ | |
/* out_pel_5_sz = 15680 */ | |
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */ | |
/* out_pel_6_pel_dim = 784 */ | |
/* out_pel_6_pel_sz = 1 */ | |
/* out_pel_6_pel_nomod = %(out_pel_6) */ | |
/* out_pel_6_pel = (%(out_pel_6)%%784) */ | |
/* out_pel_6_img_dim = 20 */ | |
/* out_pel_6_img_sz = 784 */ | |
/* out_pel_6_img_nomod = (%(out_pel_6)/784) */ | |
/* out_pel_6_img = (%(out_pel_6)/784) */ | |
/* out_pel_6_sz = 15680 */ | |
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */ | |
/* out_pel_7_pel_dim = 784 */ | |
/* out_pel_7_pel_sz = 1 */ | |
/* out_pel_7_pel_nomod = %(out_pel_7) */ | |
/* out_pel_7_pel = (%(out_pel_7)%%784) */ | |
/* out_pel_7_img_dim = 20 */ | |
/* out_pel_7_img_sz = 784 */ | |
/* out_pel_7_img_nomod = (%(out_pel_7)/784) */ | |
/* out_pel_7_img = (%(out_pel_7)/784) */ | |
/* out_pel_7_sz = 15680 */ | |
/* t_tile_stores = // begin t_tile_stores | |
int32_t tpix[%(t_tile_sz)]; | |
int32_t tcix[%(t_tile_sz)]; | |
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs | |
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]); | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
*/ | |
// Rearranges `in` into a blocked layout in `out`, one output element per work-item.
//
// in  : indexed here as img*150528 + chan*784 + y*28 + x (20 imgs, 192 chans, 28x28 pels).
// out : 3022848 elements, indexed as blk*24576 + blk_iter*1024 + blk_iter_chan*128 + blk_pel
//       (dims 123 x 24 x 8 x 128).
//
// The global 1D work-item id is used directly as the output index. Output slots whose
// source channel or image would be out of range (padding) are written as 0.0f; note that
// 123*128 = 15744 pel slots exist per channel but only 20*784 = 15680 input pels do.
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_16__bix_pels_blk_sz_123( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  // Guard against a global work size that was rounded up past the output size;
  // without this the final store below would write beyond the end of `out`.
  if( out_ix >= 3022848 ) { return; }
  // Source channel: (block iteration)*8 + (channel within the iteration).
  int32_t const chan_ix = ((out_ix/1024)%24)*8 + ((out_ix/128)%8);
  // Linear pel index over (img, y, x): (block)*128 + (pel within the block).
  int32_t const pel_ix = (out_ix/24576)*128 + (out_ix%128);
  int32_t const img_ix = pel_ix/784;
  float v = 0.0f; // value used for padding slots
  if( ( chan_ix < 192 ) && ( img_ix < 20 ) ) {
    v = in[ img_ix*150528 +
            chan_ix*784 +
            ((pel_ix/28)%28)*28 +
            (pel_ix%28)*1 ];
  }
  out[out_ix] = v;
}
/* | |
in_pels = num_img * in.sz.dims_prod() | |
num_in_blks = u32_ceil_div( in_pels, block_chan_pels ) | |
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged | |
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12 | |
pad_chan = block_iters * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64 | |
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512 | |
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512=2K (per-iter cache) is fine. | |
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel | |
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?) | |
*/ | |
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* in_chan_tile = 8 */ | |
/* pad_in_chans = 192 */ | |
/* in_chans = 192 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* tix_pels_tile_sz = 16 */ | |
/* bix_pels_blk_sz = 123 */ | |
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_192__in_chans_192__ysz_28__xsz_28__tix_pels_tile_sz_16__bix_pels_blk_sz_123 */ | |
/* out_ix_blk_pel_dim = 128 */ | |
/* out_ix_blk_pel_sz = 1 */ | |
/* out_ix_blk_pel_nomod = out_ix */ | |
/* out_ix_blk_pel = (out_ix%%128) */ | |
/* out_ix_blk_iter_chan_dim = 8 */ | |
/* out_ix_blk_iter_chan_sz = 128 */ | |
/* out_ix_blk_iter_chan_nomod = (out_ix/128) */ | |
/* out_ix_blk_iter_chan = ((out_ix/128)%%8) */ | |
/* out_ix_blk_iter_dim = 24 */ | |
/* out_ix_blk_iter_sz = 1024 */ | |
/* out_ix_blk_iter_nomod = (out_ix/1024) */ | |
/* out_ix_blk_iter = ((out_ix/1024)%%24) */ | |
/* out_ix_blk_dim = 123 */ | |
/* out_ix_blk_sz = 24576 */ | |
/* out_ix_blk_nomod = (out_ix/24576) */ | |
/* out_ix_blk = (out_ix/24576) */ | |
/* out_ix_sz = 3022848 */ | |
/* pel_ix_x_dim = 28 */ | |
/* pel_ix_x_sz = 1 */ | |
/* pel_ix_x_nomod = pel_ix */ | |
/* pel_ix_x = (pel_ix%%28) */ | |
/* pel_ix_y_dim = 28 */ | |
/* pel_ix_y_sz = 28 */ | |
/* pel_ix_y_nomod = (pel_ix/28) */ | |
/* pel_ix_y = ((pel_ix/28)%%28) */ | |
/* pel_ix_img_dim = 20 */ | |
/* pel_ix_img_sz = 784 */ | |
/* pel_ix_img_nomod = (pel_ix/784) */ | |
/* pel_ix_img = (pel_ix/784) */ | |
/* pel_ix_sz = 15680 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 192 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%192) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 150528 */ | |
/* in_ix_img_nomod = (in_ix/150528) */ | |
/* in_ix_img = (in_ix/150528) */ | |
/* in_ix_sz = 3010560 */ | |
// Copies each of the 12288 filter values from `in` to a permuted position in `out`.
// The source index is split into (out_chan, in_chan, y, x); out_chan is further split
// into (blk, tile, reg) and the pieces are recombined with the output strides
// blk:12288, in_chan:64, y:64, x:64, reg:8, tile:1.
// Work-items with an id of 12288 or more do nothing.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_64__in_chans_192__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
						  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( !( filts_ix < 12288 ) ) { return; }

  // Decompose the source index. The kernel window is 1x1, so ky and kx are always 0.
  int32_t const fioc = (filts_ix/192);
  int32_t const in_chan = (filts_ix%192);
  int32_t const ky = (filts_ix%1);
  int32_t const kx = (filts_ix%1);

  // Decompose the output channel into its block / tile / register parts.
  int32_t const out_chan_blk = (fioc/64);
  int32_t const out_chan_tile = ((fioc/8)%8);
  int32_t const out_chan_reg = (fioc%8);

  // Recombine using the output strides.
  int32_t const filts_xp_ix =
    out_chan_blk*12288 +
    in_chan*64 +
    ky*64 +
    kx*64 +
    out_chan_reg*8 +
    out_chan_tile*1;

#if 1
  float const val = in[filts_ix];
#else
  // Debug pattern: encode the kernel position for in_chan 0, zero everywhere else.
  float const val = ( in_chan == 0 ) ? ( ky*100 + kx ) : 0.0f;
#endif

  out[filts_xp_ix] = val;
}
// -- template substitution table used: -- | |
/* out_chans = 64 */ | |
/* in_chans = 192 */ | |
/* kysz = 1 */ | |
/* kxsz = 1 */ | |
/* rtc_func_name = xpose_filts__out_chans_64__in_chans_192__kysz_1__kxsz_1 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 1 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%1) */ | |
/* filts_ix_y_dim = 1 */ | |
/* filts_ix_y_sz = 1 */ | |
/* filts_ix_y_nomod = filts_ix */ | |
/* filts_ix_y = (filts_ix%%1) */ | |
/* filts_ix_in_chan_dim = 192 */ | |
/* filts_ix_in_chan_sz = 1 */ | |
/* filts_ix_in_chan_nomod = filts_ix */ | |
/* filts_ix_in_chan = (filts_ix%%192) */ | |
/* filts_ix_out_chan_dim = 64 */ | |
/* filts_ix_out_chan_sz = 192 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/192) */ | |
/* filts_ix_out_chan = (filts_ix/192) */ | |
/* filts_ix_sz = 12288 */ | |
/* filts_xp_ix_out_chan_tile_dim = 8 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%8) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 8 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/8) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/8)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 64 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/64)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 64 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/64)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 192 */ | |
/* filts_xp_ix_in_chan_sz = 64 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/64) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/64)%%192) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 12288 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/12288) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/12288) */ | |
/* filts_xp_ix_sz = 12288 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 8 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%8) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 64 */ | |
/* fioc_out_chan_blk_nomod = (fioc/64) */ | |
/* fioc_out_chan_blk = (fioc/64) */ | |
/* fioc_sz = 64 */ | |
// each work-item: copies one element of `in` (img:chan:y:x, 64 chans, 28x28) to the same
// (img, y, x) position of `out` (256 chans), with the channel index offset by ocix == 0.
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_64__ysz_28__xsz_28__out_chans_256__ocix_0( GASQ float const * const in, GASQ float * const out ) {
  int32_t const in_ix = GLOB_ID_1D; // flat index into `in`
  if( in_ix < 1003520 ) { // work-items past the end of `in` do nothing
    // split the flat input index into its coordinates
    int32_t const img = in_ix / 50176;
    int32_t const chan = (in_ix / 784) % 64;
    int32_t const y = (in_ix / 28) % 28;
    int32_t const x = in_ix % 28;
    // rebuild the flat index using the strides of `out`, shifting the channel by ocix
    int32_t const out_ix = img*200704 + (chan + 0)*784 + y*28 + x*1;
    out[out_ix] = in[in_ix];
  }
}
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_chans = 64 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* out_chans = 256 */ | |
/* ocix = 0 */ | |
/* rtc_func_name = copy__num_imgs_20__in_chans_64__ysz_28__xsz_28__out_chans_256__ocix_0 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 64 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%64) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 50176 */ | |
/* in_ix_img_nomod = (in_ix/50176) */ | |
/* in_ix_img = (in_ix/50176) */ | |
/* in_ix_sz = 1003520 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 256 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%256) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 200704 */ | |
/* out_ix_img_nomod = (out_ix/200704) */ | |
/* out_ix_img = (out_ix/200704) */ | |
/* out_ix_sz = 4014080 */ | |
// each work-item: copies one element of `in` (img:chan:y:x, 128 chans, 28x28) to the same
// (img, y, x) position of `out` (256 chans), with the channel index offset by ocix == 64.
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_128__ysz_28__xsz_28__out_chans_256__ocix_64( GASQ float const * const in, GASQ float * const out ) {
  int32_t const in_ix = GLOB_ID_1D; // flat index into `in`
  if( in_ix < 2007040 ) { // work-items past the end of `in` do nothing
    // split the flat input index into its coordinates
    int32_t const img = in_ix / 100352;
    int32_t const chan = (in_ix / 784) % 128;
    int32_t const y = (in_ix / 28) % 28;
    int32_t const x = in_ix % 28;
    // rebuild the flat index using the strides of `out`, shifting the channel by ocix
    int32_t const out_ix = img*200704 + (chan + 64)*784 + y*28 + x*1;
    out[out_ix] = in[in_ix];
  }
}
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_chans = 128 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* out_chans = 256 */ | |
/* ocix = 64 */ | |
/* rtc_func_name = copy__num_imgs_20__in_chans_128__ysz_28__xsz_28__out_chans_256__ocix_64 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 128 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%128) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 100352 */ | |
/* in_ix_img_nomod = (in_ix/100352) */ | |
/* in_ix_img = (in_ix/100352) */ | |
/* in_ix_sz = 2007040 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 256 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%256) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 200704 */ | |
/* out_ix_img_nomod = (out_ix/200704) */ | |
/* out_ix_img = (out_ix/200704) */ | |
/* out_ix_sz = 4014080 */ | |
// each work-item: copies one element of `in` (img:chan:y:x, 32 chans, 28x28) to the same
// (img, y, x) position of `out` (256 chans), with the channel index offset by ocix == 192.
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_192( GASQ float const * const in, GASQ float * const out ) {
  int32_t const in_ix = GLOB_ID_1D; // flat index into `in`
  if( in_ix < 501760 ) { // work-items past the end of `in` do nothing
    // split the flat input index into its coordinates
    int32_t const img = in_ix / 25088;
    int32_t const chan = (in_ix / 784) % 32;
    int32_t const y = (in_ix / 28) % 28;
    int32_t const x = in_ix % 28;
    // rebuild the flat index using the strides of `out`, shifting the channel by ocix
    int32_t const out_ix = img*200704 + (chan + 192)*784 + y*28 + x*1;
    out[out_ix] = in[in_ix];
  }
}
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_chans = 32 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* out_chans = 256 */ | |
/* ocix = 192 */ | |
/* rtc_func_name = copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_192 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 32 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%32) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 25088 */ | |
/* in_ix_img_nomod = (in_ix/25088) */ | |
/* in_ix_img = (in_ix/25088) */ | |
/* in_ix_sz = 501760 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 256 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%256) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 200704 */ | |
/* out_ix_img_nomod = (out_ix/200704) */ | |
/* out_ix_img = (out_ix/200704) */ | |
/* out_ix_sz = 4014080 */ | |
// each work-item: copies one element of `in` (img:chan:y:x, 32 chans, 28x28) to the same
// (img, y, x) position of `out` (256 chans), with the channel index offset by ocix == 224.
CUCL_GLOBAL_KERNEL void copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_224( GASQ float const * const in, GASQ float * const out ) {
  int32_t const in_ix = GLOB_ID_1D; // flat index into `in`
  if( in_ix < 501760 ) { // work-items past the end of `in` do nothing
    // split the flat input index into its coordinates
    int32_t const img = in_ix / 25088;
    int32_t const chan = (in_ix / 784) % 32;
    int32_t const y = (in_ix / 28) % 28;
    int32_t const x = in_ix % 28;
    // rebuild the flat index using the strides of `out`, shifting the channel by ocix
    int32_t const out_ix = img*200704 + (chan + 224)*784 + y*28 + x*1;
    out[out_ix] = in[in_ix];
  }
}
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_chans = 32 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* out_chans = 256 */ | |
/* ocix = 224 */ | |
/* rtc_func_name = copy__num_imgs_20__in_chans_32__ysz_28__xsz_28__out_chans_256__ocix_224 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 32 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%32) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 25088 */ | |
/* in_ix_img_nomod = (in_ix/25088) */ | |
/* in_ix_img = (in_ix/25088) */ | |
/* in_ix_sz = 501760 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 256 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%256) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 200704 */ | |
/* out_ix_img_nomod = (out_ix/200704) */ | |
/* out_ix_img = (out_ix/200704) */ | |
/* out_ix_sz = 4014080 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_128__write_xposed_0__in_chans_256( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) { | |
//int32_t const blk_in_ix_sz = 8*8; | |
LOCSHAR_MEM float all_smem[1536]; // note: max(filts+in,out) == max(1024+512,1024) | |
LSMASQ float * const filts_smem = all_smem; | |
LSMASQ float * const in_smem = filts_smem + 1024; | |
float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // segment of input line sufficient for one unrolling of inner loop | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*32768; // index of first out chan | |
int32_t blk_in_ix_base = GRP_ID_1D*16384 + LOC_ID_1D;// index of first input pel to load for this thread | |
LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16); | |
LSMASQ float * const in_smem_off = in_smem + 8*(LOC_ID_1D/16); | |
LSMASQ float * const out_smem_off = all_smem + LOC_ID_1D; | |
int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // adj is either 0 or LOC_ID_1D; | |
// iteratate over filter elements | |
for( int32_t blk_iter = 0; blk_iter != 32; ++blk_iter ) { | |
BARRIER_SYNC; | |
// begin smem_loads | |
filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)]; | |
filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)]; | |
filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)]; | |
filts_smem[(LOC_ID_1D + 128 * 3)] = filts[filts_off+(128*3)]; | |
filts_smem[(LOC_ID_1D + 128 * 4)] = filts[filts_off+(128*4)]; | |
filts_smem[(LOC_ID_1D + 128 * 5)] = filts[filts_off+(128*5)]; | |
filts_smem[(LOC_ID_1D + 128 * 6)] = filts[filts_off+(128*6)]; | |
filts_smem[(LOC_ID_1D + 128 * 7)] = filts[filts_off+(128*7)]; | |
in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ]; | |
in_smem[(LOC_ID_1D + 128 * 1)] = in[ blk_in_ix_base + (128*1) ]; | |
in_smem[(LOC_ID_1D + 128 * 2)] = in[ blk_in_ix_base + (128*2) ]; | |
in_smem[(LOC_ID_1D + 128 * 3)] = in[ blk_in_ix_base + (128*3) ]; | |
// end smem_loads; | |
BARRIER_SYNC; | |
filts_off += 128*8; | |
blk_in_ix_base += 512; | |
// begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*128+0*16]; | |
filts_strip[1] = filts_smem_off[0*128+1*16]; | |
filts_strip[2] = filts_smem_off[0*128+2*16]; | |
filts_strip[3] = filts_smem_off[0*128+3*16]; | |
filts_strip[4] = filts_smem_off[0*128+4*16]; | |
filts_strip[5] = filts_smem_off[0*128+5*16]; | |
filts_strip[6] = filts_smem_off[0*128+6*16]; | |
filts_strip[7] = filts_smem_off[0*128+7*16]; | |
in_strip[0] = in_smem_off[(0*8*8+0)]; | |
in_strip[1] = in_smem_off[(0*8*8+1)]; | |
in_strip[2] = in_smem_off[(0*8*8+2)]; | |
in_strip[3] = in_smem_off[(0*8*8+3)]; | |
in_strip[4] = in_smem_off[(0*8*8+4)]; | |
in_strip[5] = in_smem_off[(0*8*8+5)]; | |
in_strip[6] = in_smem_off[(0*8*8+6)]; | |
in_strip[7] = in_smem_off[(0*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*128+0*16]; | |
filts_strip[1] = filts_smem_off[1*128+1*16]; | |
filts_strip[2] = filts_smem_off[1*128+2*16]; | |
filts_strip[3] = filts_smem_off[1*128+3*16]; | |
filts_strip[4] = filts_smem_off[1*128+4*16]; | |
filts_strip[5] = filts_smem_off[1*128+5*16]; | |
filts_strip[6] = filts_smem_off[1*128+6*16]; | |
filts_strip[7] = filts_smem_off[1*128+7*16]; | |
in_strip[0] = in_smem_off[(1*8*8+0)]; | |
in_strip[1] = in_smem_off[(1*8*8+1)]; | |
in_strip[2] = in_smem_off[(1*8*8+2)]; | |
in_strip[3] = in_smem_off[(1*8*8+3)]; | |
in_strip[4] = in_smem_off[(1*8*8+4)]; | |
in_strip[5] = in_smem_off[(1*8*8+5)]; | |
in_strip[6] = in_smem_off[(1*8*8+6)]; | |
in_strip[7] = in_smem_off[(1*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*128+0*16]; | |
filts_strip[1] = filts_smem_off[2*128+1*16]; | |
filts_strip[2] = filts_smem_off[2*128+2*16]; | |
filts_strip[3] = filts_smem_off[2*128+3*16]; | |
filts_strip[4] = filts_smem_off[2*128+4*16]; | |
filts_strip[5] = filts_smem_off[2*128+5*16]; | |
filts_strip[6] = filts_smem_off[2*128+6*16]; | |
filts_strip[7] = filts_smem_off[2*128+7*16]; | |
in_strip[0] = in_smem_off[(2*8*8+0)]; | |
in_strip[1] = in_smem_off[(2*8*8+1)]; | |
in_strip[2] = in_smem_off[(2*8*8+2)]; | |
in_strip[3] = in_smem_off[(2*8*8+3)]; | |
in_strip[4] = in_smem_off[(2*8*8+4)]; | |
in_strip[5] = in_smem_off[(2*8*8+5)]; | |
in_strip[6] = in_smem_off[(2*8*8+6)]; | |
in_strip[7] = in_smem_off[(2*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*128+0*16]; | |
filts_strip[1] = filts_smem_off[3*128+1*16]; | |
filts_strip[2] = filts_smem_off[3*128+2*16]; | |
filts_strip[3] = filts_smem_off[3*128+3*16]; | |
filts_strip[4] = filts_smem_off[3*128+4*16]; | |
filts_strip[5] = filts_smem_off[3*128+5*16]; | |
filts_strip[6] = filts_smem_off[3*128+6*16]; | |
filts_strip[7] = filts_smem_off[3*128+7*16]; | |
in_strip[0] = in_smem_off[(3*8*8+0)]; | |
in_strip[1] = in_smem_off[(3*8*8+1)]; | |
in_strip[2] = in_smem_off[(3*8*8+2)]; | |
in_strip[3] = in_smem_off[(3*8*8+3)]; | |
in_strip[4] = in_smem_off[(3*8*8+4)]; | |
in_strip[5] = in_smem_off[(3*8*8+5)]; | |
in_strip[6] = in_smem_off[(3*8*8+6)]; | |
in_strip[7] = in_smem_off[(3*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*128+0*16]; | |
filts_strip[1] = filts_smem_off[4*128+1*16]; | |
filts_strip[2] = filts_smem_off[4*128+2*16]; | |
filts_strip[3] = filts_smem_off[4*128+3*16]; | |
filts_strip[4] = filts_smem_off[4*128+4*16]; | |
filts_strip[5] = filts_smem_off[4*128+5*16]; | |
filts_strip[6] = filts_smem_off[4*128+6*16]; | |
filts_strip[7] = filts_smem_off[4*128+7*16]; | |
in_strip[0] = in_smem_off[(4*8*8+0)]; | |
in_strip[1] = in_smem_off[(4*8*8+1)]; | |
in_strip[2] = in_smem_off[(4*8*8+2)]; | |
in_strip[3] = in_smem_off[(4*8*8+3)]; | |
in_strip[4] = in_smem_off[(4*8*8+4)]; | |
in_strip[5] = in_smem_off[(4*8*8+5)]; | |
in_strip[6] = in_smem_off[(4*8*8+6)]; | |
in_strip[7] = in_smem_off[(4*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*128+0*16]; | |
filts_strip[1] = filts_smem_off[5*128+1*16]; | |
filts_strip[2] = filts_smem_off[5*128+2*16]; | |
filts_strip[3] = filts_smem_off[5*128+3*16]; | |
filts_strip[4] = filts_smem_off[5*128+4*16]; | |
filts_strip[5] = filts_smem_off[5*128+5*16]; | |
filts_strip[6] = filts_smem_off[5*128+6*16]; | |
filts_strip[7] = filts_smem_off[5*128+7*16]; | |
in_strip[0] = in_smem_off[(5*8*8+0)]; | |
in_strip[1] = in_smem_off[(5*8*8+1)]; | |
in_strip[2] = in_smem_off[(5*8*8+2)]; | |
in_strip[3] = in_smem_off[(5*8*8+3)]; | |
in_strip[4] = in_smem_off[(5*8*8+4)]; | |
in_strip[5] = in_smem_off[(5*8*8+5)]; | |
in_strip[6] = in_smem_off[(5*8*8+6)]; | |
in_strip[7] = in_smem_off[(5*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*128+0*16]; | |
filts_strip[1] = filts_smem_off[6*128+1*16]; | |
filts_strip[2] = filts_smem_off[6*128+2*16]; | |
filts_strip[3] = filts_smem_off[6*128+3*16]; | |
filts_strip[4] = filts_smem_off[6*128+4*16]; | |
filts_strip[5] = filts_smem_off[6*128+5*16]; | |
filts_strip[6] = filts_smem_off[6*128+6*16]; | |
filts_strip[7] = filts_smem_off[6*128+7*16]; | |
in_strip[0] = in_smem_off[(6*8*8+0)]; | |
in_strip[1] = in_smem_off[(6*8*8+1)]; | |
in_strip[2] = in_smem_off[(6*8*8+2)]; | |
in_strip[3] = in_smem_off[(6*8*8+3)]; | |
in_strip[4] = in_smem_off[(6*8*8+4)]; | |
in_strip[5] = in_smem_off[(6*8*8+5)]; | |
in_strip[6] = in_smem_off[(6*8*8+6)]; | |
in_strip[7] = in_smem_off[(6*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*128+0*16]; | |
filts_strip[1] = filts_smem_off[7*128+1*16]; | |
filts_strip[2] = filts_smem_off[7*128+2*16]; | |
filts_strip[3] = filts_smem_off[7*128+3*16]; | |
filts_strip[4] = filts_smem_off[7*128+4*16]; | |
filts_strip[5] = filts_smem_off[7*128+5*16]; | |
filts_strip[6] = filts_smem_off[7*128+6*16]; | |
filts_strip[7] = filts_smem_off[7*128+7*16]; | |
in_strip[0] = in_smem_off[(7*8*8+0)]; | |
in_strip[1] = in_smem_off[(7*8*8+1)]; | |
in_strip[2] = in_smem_off[(7*8*8+2)]; | |
in_strip[3] = in_smem_off[(7*8*8+3)]; | |
in_strip[4] = in_smem_off[(7*8*8+4)]; | |
in_strip[5] = in_smem_off[(7*8*8+5)]; | |
in_strip[6] = in_smem_off[(7*8*8+6)]; | |
in_strip[7] = in_smem_off[(7*8*8+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
; | |
} | |
// load per-block biases into smem | |
if( flags == 2 ) { return; } | |
BARRIER_SYNC; | |
for( int32_t i = 0; i != 1; ++i ) { | |
int32_t const t_smem_bias_ix = LOC_ID_1D+128*i; | |
if( t_smem_bias_ix < 128 ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*128; | |
int32_t const load_reg = t_smem_bias_ix / 16; | |
int32_t const load_tile = t_smem_bias_ix % 16; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 128 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; } | |
} | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*16]; | |
filts_strip[1] = filts_smem_off[1*16]; | |
filts_strip[2] = filts_smem_off[2*16]; | |
filts_strip[3] = filts_smem_off[3*16]; | |
filts_strip[4] = filts_smem_off[4*16]; | |
filts_strip[5] = filts_smem_off[5*16]; | |
filts_strip[6] = filts_smem_off[6*16]; | |
filts_strip[7] = filts_smem_off[7*16]; | |
// end t_tile_bias_loads; | |
if( flags == 1 ) { | |
GASQ float * const out_off = out + LOC_ID_1D; | |
out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]); | |
; | |
return; | |
} | |
// add bias to each elem of out_tile[] and store the results to out[] | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+0)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+0)%784)*1 ; // cache out patch ixs | |
tpix[1] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+1)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+1)%784)*1 ; // cache out patch ixs | |
tpix[2] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+2)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+2)%784)*1 ; // cache out patch ixs | |
tpix[3] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+3)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+3)%784)*1 ; // cache out patch ixs | |
tpix[4] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+4)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+4)%784)*1 ; // cache out patch ixs | |
tpix[5] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+5)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+5)%784)*1 ; // cache out patch ixs | |
tpix[6] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+6)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+6)%784)*1 ; // cache out patch ixs | |
tpix[7] = ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+7)/784)*100352 + ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+7)%784)*1 ; // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+0)*784; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+1)*784; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+2)*784; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+3)*784; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+4)*784; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+5)*784; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+6)*784; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%16)+(GRP_ID_1D%1)*16)*8)+7)*784; // cache out chan ixs | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+0)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+1)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+2)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+3)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+4)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+5)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+6)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( ((GRP_ID_1D*64 + (LOC_ID_1D/16)*8+7)/784) >= 20 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (128*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (128*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (128*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (128*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (128*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (128*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (128*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (128*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 128 */ | |
/* write_xposed = 0 */ | |
/* in_chans = 256 */ | |
/* rtc_func_name = k1conv__num_imgs_20__in_dim_0_28__in_dim_1_28__conv_has_relu_1__out_chans_128__write_xposed_0__in_chans_256 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 128 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%128) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 100352 */ | |
/* out_ix_img_nomod = (out_ix/100352) */ | |
/* out_ix_img = (out_ix/100352) */ | |
/* out_ix_sz = 2007040 */ | |
/* tpb = 128 */ | |
/* in_chan_tile = 8 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 16 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */ | |
/* LOC_ID_1D_pels_tile_dim = 8 */ | |
/* LOC_ID_1D_pels_tile_sz = 16 */ | |
/* LOC_ID_1D_pels_tile_nomod = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_pels_tile = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_pels_blk_dim = 245 */ | |
/* GRP_ID_1D_pels_blk_sz = 1 */ | |
/* GRP_ID_1D_pels_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_pels_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 245 */ | |
/* in_ix_blk_pel_dim = 64 */ | |
/* in_ix_blk_pel_sz = 1 */ | |
/* in_ix_blk_pel_nomod = in_ix */ | |
/* in_ix_blk_pel = (in_ix%%64) */ | |
/* in_ix_blk_iter_chan_dim = 8 */ | |
/* in_ix_blk_iter_chan_sz = 64 */ | |
/* in_ix_blk_iter_chan_nomod = (in_ix/64) */ | |
/* in_ix_blk_iter_chan = ((in_ix/64)%%8) */ | |
/* in_ix_blk_iter_dim = 32 */ | |
/* in_ix_blk_iter_sz = 512 */ | |
/* in_ix_blk_iter_nomod = (in_ix/512) */ | |
/* in_ix_blk_iter = ((in_ix/512)%%32) */ | |
/* in_ix_blk_dim = 245 */ | |
/* in_ix_blk_sz = 16384 */ | |
/* in_ix_blk_nomod = (in_ix/16384) */ | |
/* in_ix_blk = (in_ix/16384) */ | |
/* in_ix_sz = 4014080 */ | |
/* blk_filt_ix_sz = 128 */ | |
/* filts_smem_sz = 1024 */ | |
/* in_smem_sz = 512 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1536 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_in_chan_dim = 256 */ | |
/* filts_xp_ix_in_chan_sz = 128 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/128)%%256) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 32768 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/32768) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/32768) */ | |
/* filts_xp_ix_sz = 32768 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* smem_loads = // begin smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 3)] = filts[filts_off+(%(tpb)*3)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 4)] = filts[filts_off+(%(tpb)*4)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 5)] = filts[filts_off+(%(tpb)*5)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 6)] = filts[filts_off+(%(tpb)*6)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 7)] = filts[filts_off+(%(tpb)*7)]; | |
in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 1)] = in[ blk_in_ix_base + (%(tpb)*1) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 2)] = in[ blk_in_ix_base + (%(tpb)*2) ]; | |
in_smem[(LOC_ID_1D + %(tpb) * 3)] = in[ blk_in_ix_base + (%(tpb)*3) ]; | |
// end smem_loads */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* t_smem_ld_pel_pel_dim = 64 */ | |
/* t_smem_ld_pel_pel_sz = 1 */ | |
/* t_smem_ld_pel_pel_nomod = t_smem_ld_pel */ | |
/* t_smem_ld_pel_pel = (t_smem_ld_pel%%64) */ | |
/* t_smem_ld_pel_chan_dim = 8 */ | |
/* t_smem_ld_pel_chan_sz = 64 */ | |
/* t_smem_ld_pel_chan_nomod = (t_smem_ld_pel/64) */ | |
/* t_smem_ld_pel_chan = (t_smem_ld_pel/64) */ | |
/* t_smem_ld_pel_sz = 512 */ | |
/* out_pel_0 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+0) */ | |
/* out_pel_0_pel_dim = 784 */ | |
/* out_pel_0_pel_sz = 1 */ | |
/* out_pel_0_pel_nomod = %(out_pel_0) */ | |
/* out_pel_0_pel = (%(out_pel_0)%%784) */ | |
/* out_pel_0_img_dim = 20 */ | |
/* out_pel_0_img_sz = 784 */ | |
/* out_pel_0_img_nomod = (%(out_pel_0)/784) */ | |
/* out_pel_0_img = (%(out_pel_0)/784) */ | |
/* out_pel_0_sz = 15680 */ | |
/* out_pel_1 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+1) */ | |
/* out_pel_1_pel_dim = 784 */ | |
/* out_pel_1_pel_sz = 1 */ | |
/* out_pel_1_pel_nomod = %(out_pel_1) */ | |
/* out_pel_1_pel = (%(out_pel_1)%%784) */ | |
/* out_pel_1_img_dim = 20 */ | |
/* out_pel_1_img_sz = 784 */ | |
/* out_pel_1_img_nomod = (%(out_pel_1)/784) */ | |
/* out_pel_1_img = (%(out_pel_1)/784) */ | |
/* out_pel_1_sz = 15680 */ | |
/* out_pel_2 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+2) */ | |
/* out_pel_2_pel_dim = 784 */ | |
/* out_pel_2_pel_sz = 1 */ | |
/* out_pel_2_pel_nomod = %(out_pel_2) */ | |
/* out_pel_2_pel = (%(out_pel_2)%%784) */ | |
/* out_pel_2_img_dim = 20 */ | |
/* out_pel_2_img_sz = 784 */ | |
/* out_pel_2_img_nomod = (%(out_pel_2)/784) */ | |
/* out_pel_2_img = (%(out_pel_2)/784) */ | |
/* out_pel_2_sz = 15680 */ | |
/* out_pel_3 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+3) */ | |
/* out_pel_3_pel_dim = 784 */ | |
/* out_pel_3_pel_sz = 1 */ | |
/* out_pel_3_pel_nomod = %(out_pel_3) */ | |
/* out_pel_3_pel = (%(out_pel_3)%%784) */ | |
/* out_pel_3_img_dim = 20 */ | |
/* out_pel_3_img_sz = 784 */ | |
/* out_pel_3_img_nomod = (%(out_pel_3)/784) */ | |
/* out_pel_3_img = (%(out_pel_3)/784) */ | |
/* out_pel_3_sz = 15680 */ | |
/* out_pel_4 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+4) */ | |
/* out_pel_4_pel_dim = 784 */ | |
/* out_pel_4_pel_sz = 1 */ | |
/* out_pel_4_pel_nomod = %(out_pel_4) */ | |
/* out_pel_4_pel = (%(out_pel_4)%%784) */ | |
/* out_pel_4_img_dim = 20 */ | |
/* out_pel_4_img_sz = 784 */ | |
/* out_pel_4_img_nomod = (%(out_pel_4)/784) */ | |
/* out_pel_4_img = (%(out_pel_4)/784) */ | |
/* out_pel_4_sz = 15680 */ | |
/* out_pel_5 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+5) */ | |
/* out_pel_5_pel_dim = 784 */ | |
/* out_pel_5_pel_sz = 1 */ | |
/* out_pel_5_pel_nomod = %(out_pel_5) */ | |
/* out_pel_5_pel = (%(out_pel_5)%%784) */ | |
/* out_pel_5_img_dim = 20 */ | |
/* out_pel_5_img_sz = 784 */ | |
/* out_pel_5_img_nomod = (%(out_pel_5)/784) */ | |
/* out_pel_5_img = (%(out_pel_5)/784) */ | |
/* out_pel_5_sz = 15680 */ | |
/* out_pel_6 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+6) */ | |
/* out_pel_6_pel_dim = 784 */ | |
/* out_pel_6_pel_sz = 1 */ | |
/* out_pel_6_pel_nomod = %(out_pel_6) */ | |
/* out_pel_6_pel = (%(out_pel_6)%%784) */ | |
/* out_pel_6_img_dim = 20 */ | |
/* out_pel_6_img_sz = 784 */ | |
/* out_pel_6_img_nomod = (%(out_pel_6)/784) */ | |
/* out_pel_6_img = (%(out_pel_6)/784) */ | |
/* out_pel_6_sz = 15680 */ | |
/* out_pel_7 = (%(GRP_ID_1D_pels_blk)*%(in_ix_blk_pel_dim) + %(LOC_ID_1D_pels_tile)*%(t_tile_sz)+7) */ | |
/* out_pel_7_pel_dim = 784 */ | |
/* out_pel_7_pel_sz = 1 */ | |
/* out_pel_7_pel_nomod = %(out_pel_7) */ | |
/* out_pel_7_pel = (%(out_pel_7)%%784) */ | |
/* out_pel_7_img_dim = 20 */ | |
/* out_pel_7_img_sz = 784 */ | |
/* out_pel_7_img_nomod = (%(out_pel_7)/784) */ | |
/* out_pel_7_img = (%(out_pel_7)/784) */ | |
/* out_pel_7_sz = 15680 */ | |
/* t_tile_stores = // begin t_tile_stores | |
int32_t tpix[%(t_tile_sz)]; | |
int32_t tcix[%(t_tile_sz)]; | |
tpix[0] = %(out_pel_0_img)*%(out_ix_img_sz) + %(out_pel_0_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[1] = %(out_pel_1_img)*%(out_ix_img_sz) + %(out_pel_1_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[2] = %(out_pel_2_img)*%(out_ix_img_sz) + %(out_pel_2_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[3] = %(out_pel_3_img)*%(out_ix_img_sz) + %(out_pel_3_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[4] = %(out_pel_4_img)*%(out_ix_img_sz) + %(out_pel_4_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[5] = %(out_pel_5_img)*%(out_ix_img_sz) + %(out_pel_5_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[6] = %(out_pel_6_img)*%(out_ix_img_sz) + %(out_pel_6_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tpix[7] = %(out_pel_7_img)*%(out_ix_img_sz) + %(out_pel_7_pel)*%(out_ix_x_sz) ; // cache out patch ixs | |
tcix[0] = (%(out_chan_ix)+0)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[1] = (%(out_chan_ix)+1)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[2] = (%(out_chan_ix)+2)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[3] = (%(out_chan_ix)+3)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[4] = (%(out_chan_ix)+4)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[5] = (%(out_chan_ix)+5)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[6] = (%(out_chan_ix)+6)*%(out_ix_chan_sz); // cache out chan ixs | |
tcix[7] = (%(out_chan_ix)+7)*%(out_ix_chan_sz); // cache out chan ixs | |
if( %(out_pel_0_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( %(out_pel_1_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( %(out_pel_2_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( %(out_pel_3_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( %(out_pel_4_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( %(out_pel_5_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( %(out_pel_6_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( %(out_pel_7_img) >= %(out_ix_img_dim) ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (%(out_ix_chan_dim)*%(out_ix_chan_sz)) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
/* t_tile_dummy_stores = out_off[0] = max(0.0f,out_tile[0]+filts_strip[0]); | |
out_off[128] = max(0.0f,out_tile[1]+filts_strip[1]); | |
out_off[256] = max(0.0f,out_tile[2]+filts_strip[2]); | |
out_off[384] = max(0.0f,out_tile[3]+filts_strip[3]); | |
out_off[512] = max(0.0f,out_tile[4]+filts_strip[4]); | |
out_off[640] = max(0.0f,out_tile[5]+filts_strip[5]); | |
out_off[768] = max(0.0f,out_tile[6]+filts_strip[6]); | |
out_off[896] = max(0.0f,out_tile[7]+filts_strip[7]); | |
out_off[1024] = max(0.0f,out_tile[8]+filts_strip[0]); | |
out_off[1152] = max(0.0f,out_tile[9]+filts_strip[1]); | |
out_off[1280] = max(0.0f,out_tile[10]+filts_strip[2]); | |
out_off[1408] = max(0.0f,out_tile[11]+filts_strip[3]); | |
out_off[1536] = max(0.0f,out_tile[12]+filts_strip[4]); | |
out_off[1664] = max(0.0f,out_tile[13]+filts_strip[5]); | |
out_off[1792] = max(0.0f,out_tile[14]+filts_strip[6]); | |
out_off[1920] = max(0.0f,out_tile[15]+filts_strip[7]); | |
out_off[2048] = max(0.0f,out_tile[16]+filts_strip[0]); | |
out_off[2176] = max(0.0f,out_tile[17]+filts_strip[1]); | |
out_off[2304] = max(0.0f,out_tile[18]+filts_strip[2]); | |
out_off[2432] = max(0.0f,out_tile[19]+filts_strip[3]); | |
out_off[2560] = max(0.0f,out_tile[20]+filts_strip[4]); | |
out_off[2688] = max(0.0f,out_tile[21]+filts_strip[5]); | |
out_off[2816] = max(0.0f,out_tile[22]+filts_strip[6]); | |
out_off[2944] = max(0.0f,out_tile[23]+filts_strip[7]); | |
out_off[3072] = max(0.0f,out_tile[24]+filts_strip[0]); | |
out_off[3200] = max(0.0f,out_tile[25]+filts_strip[1]); | |
out_off[3328] = max(0.0f,out_tile[26]+filts_strip[2]); | |
out_off[3456] = max(0.0f,out_tile[27]+filts_strip[3]); | |
out_off[3584] = max(0.0f,out_tile[28]+filts_strip[4]); | |
out_off[3712] = max(0.0f,out_tile[29]+filts_strip[5]); | |
out_off[3840] = max(0.0f,out_tile[30]+filts_strip[6]); | |
out_off[3968] = max(0.0f,out_tile[31]+filts_strip[7]); | |
out_off[4096] = max(0.0f,out_tile[32]+filts_strip[0]); | |
out_off[4224] = max(0.0f,out_tile[33]+filts_strip[1]); | |
out_off[4352] = max(0.0f,out_tile[34]+filts_strip[2]); | |
out_off[4480] = max(0.0f,out_tile[35]+filts_strip[3]); | |
out_off[4608] = max(0.0f,out_tile[36]+filts_strip[4]); | |
out_off[4736] = max(0.0f,out_tile[37]+filts_strip[5]); | |
out_off[4864] = max(0.0f,out_tile[38]+filts_strip[6]); | |
out_off[4992] = max(0.0f,out_tile[39]+filts_strip[7]); | |
out_off[5120] = max(0.0f,out_tile[40]+filts_strip[0]); | |
out_off[5248] = max(0.0f,out_tile[41]+filts_strip[1]); | |
out_off[5376] = max(0.0f,out_tile[42]+filts_strip[2]); | |
out_off[5504] = max(0.0f,out_tile[43]+filts_strip[3]); | |
out_off[5632] = max(0.0f,out_tile[44]+filts_strip[4]); | |
out_off[5760] = max(0.0f,out_tile[45]+filts_strip[5]); | |
out_off[5888] = max(0.0f,out_tile[46]+filts_strip[6]); | |
out_off[6016] = max(0.0f,out_tile[47]+filts_strip[7]); | |
out_off[6144] = max(0.0f,out_tile[48]+filts_strip[0]); | |
out_off[6272] = max(0.0f,out_tile[49]+filts_strip[1]); | |
out_off[6400] = max(0.0f,out_tile[50]+filts_strip[2]); | |
out_off[6528] = max(0.0f,out_tile[51]+filts_strip[3]); | |
out_off[6656] = max(0.0f,out_tile[52]+filts_strip[4]); | |
out_off[6784] = max(0.0f,out_tile[53]+filts_strip[5]); | |
out_off[6912] = max(0.0f,out_tile[54]+filts_strip[6]); | |
out_off[7040] = max(0.0f,out_tile[55]+filts_strip[7]); | |
out_off[7168] = max(0.0f,out_tile[56]+filts_strip[0]); | |
out_off[7296] = max(0.0f,out_tile[57]+filts_strip[1]); | |
out_off[7424] = max(0.0f,out_tile[58]+filts_strip[2]); | |
out_off[7552] = max(0.0f,out_tile[59]+filts_strip[3]); | |
out_off[7680] = max(0.0f,out_tile[60]+filts_strip[4]); | |
out_off[7808] = max(0.0f,out_tile[61]+filts_strip[5]); | |
out_off[7936] = max(0.0f,out_tile[62]+filts_strip[6]); | |
out_off[8064] = max(0.0f,out_tile[63]+filts_strip[7]); | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(0*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(1*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(2*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[3*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[3*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[3*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[3*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[3*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[3*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[3*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(3*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[4*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[4*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[4*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[4*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[4*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[4*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[4*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(4*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[5*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[5*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[5*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[5*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[5*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[5*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[5*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(5*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[6*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[6*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[6*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[6*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[6*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[6*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[6*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(6*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[7*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[7*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[7*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[7*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[7*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[7*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[7*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
in_strip[0] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+0)]; | |
in_strip[1] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+1)]; | |
in_strip[2] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+2)]; | |
in_strip[3] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+3)]; | |
in_strip[4] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+4)]; | |
in_strip[5] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+5)]; | |
in_strip[6] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+6)]; | |
in_strip[7] = in_smem_off[(7*%(t_tile_sz)*%(LOC_ID_1D_pels_tile_dim)+7)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
*/ | |
// Gathers the input, indexed as img:chan:y:x with strides 200704:784:28:1, into a blocked layout:
//   out_ix = blk*16384 + chan_grp*512 + chan_in_grp*64 + pel_in_blk
// where pel = img*784 + y*28 + x, each block covers 64 consecutive pels, and the chans are
// split into 32 groups of 8. Each work-item writes exactly one element of out; elements with
// no corresponding input chan/image are written as 0.
CUCL_GLOBAL_KERNEL void xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_256__in_chans_256__ysz_28__xsz_28__tix_pels_tile_sz_8__bix_pels_blk_sz_245( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  // out is expected to hold 245 blocks of 16384 elements (== 4014080). If the launch size was
  // rounded up past that, the surplus work-items must not store beyond the end of out.
  if( out_ix >= 4014080 ) { return; }
  int32_t const chan_ix = ((out_ix/512)%32)*8 + ((out_ix/64)%8); // chan group * 8 + chan within group
  int32_t const pel_ix = (out_ix/16384)*64 + (out_ix%64); // block * 64 + pel within block
  float v = 0.0f; // stored as-is when the chan or image is outside the input
  if( ( chan_ix < 256 ) && ( (pel_ix/784) < 20 ) ) { // only read in[] for a valid chan and image
    v = in[ (pel_ix/784)*200704 + // img
            chan_ix*784 +         // chan
            ((pel_ix/28)%28)*28 + // y
            (pel_ix%28)*1 ];      // x
  }
  out[out_ix] = v;
}
/* | |
in_pels = num_img * in.sz.dims_prod() | |
num_in_blks = u32_ceil_div( in_pels, block_chan_pels ) | |
normal in dims: img, chan, y, x OR img, chan, pels // where pels = x,y dims merged | |
block_iters = u32_ceil_div( chan, in_chan_tile ) // for ccp1, 96/8=12 | |
pad_chan = block_iters * in_chan_tile // pad by up to (in_chan_tile-1) [typ. 8; pad with zeros? garbage okay?]
block_chan_pels = t_tile_sz*tix_pels_tile_sz // typically 8*8=64 | |
block_iter_pels = block_chan_pels * in_chan_tile; // typically 512 | |
block_pels = 12*512 = 6144 // note: 24576 bytes, prob. too big for SM to fully cache, but 512=2K (per-iter cache) is fine. | |
xposed in dims (inner): (block_iter, block_iter_chan, block_iter_pel) == block_pel | |
sz (inner): (block_iters, in_chan_tile, block_chan_pels) == block_pels (only inner 2 dims need to be linear?) | |
*/ | |
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* in_chan_tile = 8 */ | |
/* pad_in_chans = 256 */ | |
/* in_chans = 256 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* tix_pels_tile_sz = 8 */ | |
/* bix_pels_blk_sz = 245 */ | |
/* rtc_func_name = xpose_in__num_imgs_20__in_chan_tile_8__pad_in_chans_256__in_chans_256__ysz_28__xsz_28__tix_pels_tile_sz_8__bix_pels_blk_sz_245 */ | |
/* out_ix_blk_pel_dim = 64 */ | |
/* out_ix_blk_pel_sz = 1 */ | |
/* out_ix_blk_pel_nomod = out_ix */ | |
/* out_ix_blk_pel = (out_ix%%64) */ | |
/* out_ix_blk_iter_chan_dim = 8 */ | |
/* out_ix_blk_iter_chan_sz = 64 */ | |
/* out_ix_blk_iter_chan_nomod = (out_ix/64) */ | |
/* out_ix_blk_iter_chan = ((out_ix/64)%%8) */ | |
/* out_ix_blk_iter_dim = 32 */ | |
/* out_ix_blk_iter_sz = 512 */ | |
/* out_ix_blk_iter_nomod = (out_ix/512) */ | |
/* out_ix_blk_iter = ((out_ix/512)%%32) */ | |
/* out_ix_blk_dim = 245 */ | |
/* out_ix_blk_sz = 16384 */ | |
/* out_ix_blk_nomod = (out_ix/16384) */ | |
/* out_ix_blk = (out_ix/16384) */ | |
/* out_ix_sz = 4014080 */ | |
/* pel_ix_x_dim = 28 */ | |
/* pel_ix_x_sz = 1 */ | |
/* pel_ix_x_nomod = pel_ix */ | |
/* pel_ix_x = (pel_ix%%28) */ | |
/* pel_ix_y_dim = 28 */ | |
/* pel_ix_y_sz = 28 */ | |
/* pel_ix_y_nomod = (pel_ix/28) */ | |
/* pel_ix_y = ((pel_ix/28)%%28) */ | |
/* pel_ix_img_dim = 20 */ | |
/* pel_ix_img_sz = 784 */ | |
/* pel_ix_img_nomod = (pel_ix/784) */ | |
/* pel_ix_img = (pel_ix/784) */ | |
/* pel_ix_sz = 15680 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 256 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%256) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 200704 */ | |
/* in_ix_img_nomod = (in_ix/200704) */ | |
/* in_ix_img = (in_ix/200704) */ | |
/* in_ix_sz = 4014080 */ | |
// Copies each filter weight from the out_chan:in_chan:y:x source layout to its slot in the
// transposed layout named by the CUCL OUT annotation. One work-item moves one element.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_128__in_chans_256__kysz_1__kxsz_1( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                 GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix < 32768 ) { // work-items at or past 32768 have no element to move
    int32_t const fioc = (filts_ix/256);        // source out_chan index
    int32_t const blk_off = (fioc/128)*32768;   // out_chan_blk term
    int32_t const reg_off = (fioc%8)*16;        // out_chan_reg term
    int32_t const tile_off = ((fioc/8)%16)*1;   // out_chan_tile term
    int32_t const in_chan_off = (filts_ix%256)*128; // in_chan term
    // The y and x terms would each be (filts_ix%1)*128, which is zero for every index,
    // so they contribute nothing to the destination offset and are left out.
    out[ blk_off + reg_off + tile_off + in_chan_off ] = in[filts_ix];
  }
}
// -- template substitution table used: --
/* out_chans = 128 */ | |
/* in_chans = 256 */ | |
/* kysz = 1 */ | |
/* kxsz = 1 */ | |
/* rtc_func_name = xpose_filts__out_chans_128__in_chans_256__kysz_1__kxsz_1 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 1 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%1) */ | |
/* filts_ix_y_dim = 1 */ | |
/* filts_ix_y_sz = 1 */ | |
/* filts_ix_y_nomod = filts_ix */ | |
/* filts_ix_y = (filts_ix%%1) */ | |
/* filts_ix_in_chan_dim = 256 */ | |
/* filts_ix_in_chan_sz = 1 */ | |
/* filts_ix_in_chan_nomod = filts_ix */ | |
/* filts_ix_in_chan = (filts_ix%%256) */ | |
/* filts_ix_out_chan_dim = 128 */ | |
/* filts_ix_out_chan_sz = 256 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/256) */ | |
/* filts_ix_out_chan = (filts_ix/256) */ | |
/* filts_ix_sz = 32768 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 128 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/128)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 256 */ | |
/* filts_xp_ix_in_chan_sz = 128 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/128)%%256) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 32768 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/32768) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/32768) */ | |
/* filts_xp_ix_sz = 32768 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 16 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%16) */ | |
/* fioc_out_chan_blk_dim = 1 */ | |
/* fioc_out_chan_blk_sz = 128 */ | |
/* fioc_out_chan_blk_nomod = (fioc/128) */ | |
/* fioc_out_chan_blk = (fioc/128) */ | |
/* fioc_sz = 128 */ | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
// Tiled 3x3 / stride 1 / pad 1 convolution with bias add and ReLU, specialized (see the
// substitution table that follows this kernel) for 20 images, 128 input channels and
// 192 output channels of 28x28.
//
// Work decomposition (all derived from the index math in the body):
//   work-group size : 128 work-items (tpb); LOC_ID_1D = blk_y*16 + out_chan_tile
//   work-groups     : GRP_ID_1D = blk_bline*8 + blk_bx*2 + out_chan_blk (table: GRP_ID_1D_sz = 560)
//   each work-item  : one output line, 8 consecutive x positions, 8 consecutive output
//                     channels, accumulated in out_tile[x*8 + chan_reg]
//
// Arguments:
//   filts  - filter weights in the pre-arranged "filts_xp_ix" order; 384 consecutive floats
//            (3 kx * 8 chan_reg * 16 chan_tile) are consumed per (in_chan, ky) step, starting
//            at out_chan_blk*147456.
//   biases - one bias per output channel; only indices < 192 are read.
//   in     - input in the pre-blocked "in_ix_blk" order: 120 floats (12 lines * 10 pels) per
//            input channel, 15360 floats per (blk_bline, blk_bx) block.
//            NOTE(review): padding pels/lines are assumed to be stored as zeros -- confirm
//            against the kernel that produces this layout.
//   out    - output, indexed img*150528 + chan*784 + y*28 + x.
//   flags  - 2: return right after accumulation; 1: return after the bias loads;
//            any other value: run to completion. (Presumably a profiling aid -- confirm with caller.)
//
// The inner loop is kept fully unrolled, as generated, so that every out_tile/in_strip/
// filts_strip index is a compile-time constant.
CUCL_GLOBAL_KERNEL void tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_128( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out, int32_t const flags ) {
  LOCSHAR_MEM float all_smem[1024]; // note: max(filts+in,out) == max(384+120,1024)
  LSMASQ float * const filts_smem = all_smem;
  LSMASQ float * const in_smem = filts_smem + 384;
  float out_tile[8*8] = {0.0f}; // tile of output for this thread to compute: out_tile[x*8 + chan_reg]
  // buffers for one strip each from in and filts, for the same filts_ix_out_chan_elem
  float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz )
  float in_strip[10]; // segment of input line sufficient for one unrolling of inner loop (8 + kern_sz - 1)
  int32_t blk_in_ix_base = (GRP_ID_1D/2)*15360 + LOC_ID_1D; // index of first input pel to load for this thread
  int32_t const blk_filt_ix_base = (GRP_ID_1D%2)*147456; // index of first out chan
  int32_t filts_off = blk_filt_ix_base + LOC_ID_1D; // per-thread read cursor into filts
  LSMASQ float * const filts_smem_off = filts_smem + (LOC_ID_1D%16); // this thread's out_chan_tile column
  int32_t out_line = (GRP_ID_1D/8)*8; // first out_line of block
  int32_t const blk_fli = (out_line/28); // image of first out_line of block
  out_line += (LOC_ID_1D/16); // adjust to out_line of this thread
  // offset in lines to deal with >1 img/block = (number of prior images (partial or full) in this block) * (adj to next img)
  int32_t const img_off_lines = ((out_line/28) - blk_fli)*(3-1);
  int32_t const in_y = (out_line%28)*1 - 1; // input row under kernel row ky==0; -1 is the top padding row

  for( int32_t in_chan = 0; in_chan != 128; ++in_chan ) {
    BARRIER_SYNC; // everyone is done reading in_smem/filts_smem from the previous channel
    // begin in_smem_loads: 120 floats (12 lines * 10 pels) of this channel, one per thread
    if( (LOC_ID_1D + 128 * 0) < 120) { in_smem[(LOC_ID_1D + 128 * 0)] = in[ blk_in_ix_base + (128*0) ];}
    blk_in_ix_base += 120;
    // end in_smem_loads;
    for( int32_t ky = 0; ky != 3; ++ky ) {
      if( ky != 0 ) { BARRIER_SYNC; } // previous ky's filters are no longer needed (ky==0 is covered by the barrier above)
      // begin filt_smem_loads: 384 floats = 3 kx * 128 (8 chan_reg * 16 chan_tile), three per thread
      filts_smem[(LOC_ID_1D + 128 * 0)] = filts[filts_off+(128*0)];
      filts_smem[(LOC_ID_1D + 128 * 1)] = filts[filts_off+(128*1)];
      filts_smem[(LOC_ID_1D + 128 * 2)] = filts[filts_off+(128*2)];
      filts_off += 384;
      // end filt_smem_loads;
      BARRIER_SYNC; // in_smem and filts_smem are now visible to the whole work-group
      // note: the 'continue's below only skip arithmetic; every thread still reaches every barrier.
      if( (out_line/28) >= 20 ) { continue; } // required: skip lines from invalid images (read might be invalid)
      // optimization: skip known-to-be-padding input lines. valid rows are 0..27, so both row -1
      // and row 28 are padding. (the upper test used to be '>28', which can never be true since
      // in_y+ky <= 28, so the bottom padding row was never skipped.) like the '< 0' case, this
      // relies on padding rows holding zeros.
      if( ((in_y+ky) < 0) || ((in_y+ky) >= 28) ) { continue; }
      LSMASQ float * const in_smem_off = in_smem + ((LOC_ID_1D/16)*1+ky+img_off_lines)*10;
      // begin inner_loop_body
      in_strip[0] = in_smem_off[0];
      in_strip[1] = in_smem_off[1];
      in_strip[2] = in_smem_off[2];
      in_strip[3] = in_smem_off[3];
      in_strip[4] = in_smem_off[4];
      in_strip[5] = in_smem_off[5];
      in_strip[6] = in_smem_off[6];
      in_strip[7] = in_smem_off[7];
      in_strip[8] = in_smem_off[8];
      in_strip[9] = in_smem_off[9];
      // ---- kx == 0 : out_tile[x*8+r] += filt[kx][r] * in_strip[x+0]
      filts_strip[0] = filts_smem_off[0*128+0*16];
      filts_strip[1] = filts_smem_off[0*128+1*16];
      filts_strip[2] = filts_smem_off[0*128+2*16];
      filts_strip[3] = filts_smem_off[0*128+3*16];
      filts_strip[4] = filts_smem_off[0*128+4*16];
      filts_strip[5] = filts_smem_off[0*128+5*16];
      filts_strip[6] = filts_smem_off[0*128+6*16];
      filts_strip[7] = filts_smem_off[0*128+7*16];
      out_tile[0] += filts_strip[0]*in_strip[0];
      out_tile[1] += filts_strip[1]*in_strip[0];
      out_tile[2] += filts_strip[2]*in_strip[0];
      out_tile[3] += filts_strip[3]*in_strip[0];
      out_tile[4] += filts_strip[4]*in_strip[0];
      out_tile[5] += filts_strip[5]*in_strip[0];
      out_tile[6] += filts_strip[6]*in_strip[0];
      out_tile[7] += filts_strip[7]*in_strip[0];
      out_tile[8] += filts_strip[0]*in_strip[1];
      out_tile[9] += filts_strip[1]*in_strip[1];
      out_tile[10] += filts_strip[2]*in_strip[1];
      out_tile[11] += filts_strip[3]*in_strip[1];
      out_tile[12] += filts_strip[4]*in_strip[1];
      out_tile[13] += filts_strip[5]*in_strip[1];
      out_tile[14] += filts_strip[6]*in_strip[1];
      out_tile[15] += filts_strip[7]*in_strip[1];
      out_tile[16] += filts_strip[0]*in_strip[2];
      out_tile[17] += filts_strip[1]*in_strip[2];
      out_tile[18] += filts_strip[2]*in_strip[2];
      out_tile[19] += filts_strip[3]*in_strip[2];
      out_tile[20] += filts_strip[4]*in_strip[2];
      out_tile[21] += filts_strip[5]*in_strip[2];
      out_tile[22] += filts_strip[6]*in_strip[2];
      out_tile[23] += filts_strip[7]*in_strip[2];
      out_tile[24] += filts_strip[0]*in_strip[3];
      out_tile[25] += filts_strip[1]*in_strip[3];
      out_tile[26] += filts_strip[2]*in_strip[3];
      out_tile[27] += filts_strip[3]*in_strip[3];
      out_tile[28] += filts_strip[4]*in_strip[3];
      out_tile[29] += filts_strip[5]*in_strip[3];
      out_tile[30] += filts_strip[6]*in_strip[3];
      out_tile[31] += filts_strip[7]*in_strip[3];
      out_tile[32] += filts_strip[0]*in_strip[4];
      out_tile[33] += filts_strip[1]*in_strip[4];
      out_tile[34] += filts_strip[2]*in_strip[4];
      out_tile[35] += filts_strip[3]*in_strip[4];
      out_tile[36] += filts_strip[4]*in_strip[4];
      out_tile[37] += filts_strip[5]*in_strip[4];
      out_tile[38] += filts_strip[6]*in_strip[4];
      out_tile[39] += filts_strip[7]*in_strip[4];
      out_tile[40] += filts_strip[0]*in_strip[5];
      out_tile[41] += filts_strip[1]*in_strip[5];
      out_tile[42] += filts_strip[2]*in_strip[5];
      out_tile[43] += filts_strip[3]*in_strip[5];
      out_tile[44] += filts_strip[4]*in_strip[5];
      out_tile[45] += filts_strip[5]*in_strip[5];
      out_tile[46] += filts_strip[6]*in_strip[5];
      out_tile[47] += filts_strip[7]*in_strip[5];
      out_tile[48] += filts_strip[0]*in_strip[6];
      out_tile[49] += filts_strip[1]*in_strip[6];
      out_tile[50] += filts_strip[2]*in_strip[6];
      out_tile[51] += filts_strip[3]*in_strip[6];
      out_tile[52] += filts_strip[4]*in_strip[6];
      out_tile[53] += filts_strip[5]*in_strip[6];
      out_tile[54] += filts_strip[6]*in_strip[6];
      out_tile[55] += filts_strip[7]*in_strip[6];
      out_tile[56] += filts_strip[0]*in_strip[7];
      out_tile[57] += filts_strip[1]*in_strip[7];
      out_tile[58] += filts_strip[2]*in_strip[7];
      out_tile[59] += filts_strip[3]*in_strip[7];
      out_tile[60] += filts_strip[4]*in_strip[7];
      out_tile[61] += filts_strip[5]*in_strip[7];
      out_tile[62] += filts_strip[6]*in_strip[7];
      out_tile[63] += filts_strip[7]*in_strip[7];
      // ---- kx == 1 : out_tile[x*8+r] += filt[kx][r] * in_strip[x+1]
      filts_strip[0] = filts_smem_off[1*128+0*16];
      filts_strip[1] = filts_smem_off[1*128+1*16];
      filts_strip[2] = filts_smem_off[1*128+2*16];
      filts_strip[3] = filts_smem_off[1*128+3*16];
      filts_strip[4] = filts_smem_off[1*128+4*16];
      filts_strip[5] = filts_smem_off[1*128+5*16];
      filts_strip[6] = filts_smem_off[1*128+6*16];
      filts_strip[7] = filts_smem_off[1*128+7*16];
      out_tile[0] += filts_strip[0]*in_strip[1];
      out_tile[1] += filts_strip[1]*in_strip[1];
      out_tile[2] += filts_strip[2]*in_strip[1];
      out_tile[3] += filts_strip[3]*in_strip[1];
      out_tile[4] += filts_strip[4]*in_strip[1];
      out_tile[5] += filts_strip[5]*in_strip[1];
      out_tile[6] += filts_strip[6]*in_strip[1];
      out_tile[7] += filts_strip[7]*in_strip[1];
      out_tile[8] += filts_strip[0]*in_strip[2];
      out_tile[9] += filts_strip[1]*in_strip[2];
      out_tile[10] += filts_strip[2]*in_strip[2];
      out_tile[11] += filts_strip[3]*in_strip[2];
      out_tile[12] += filts_strip[4]*in_strip[2];
      out_tile[13] += filts_strip[5]*in_strip[2];
      out_tile[14] += filts_strip[6]*in_strip[2];
      out_tile[15] += filts_strip[7]*in_strip[2];
      out_tile[16] += filts_strip[0]*in_strip[3];
      out_tile[17] += filts_strip[1]*in_strip[3];
      out_tile[18] += filts_strip[2]*in_strip[3];
      out_tile[19] += filts_strip[3]*in_strip[3];
      out_tile[20] += filts_strip[4]*in_strip[3];
      out_tile[21] += filts_strip[5]*in_strip[3];
      out_tile[22] += filts_strip[6]*in_strip[3];
      out_tile[23] += filts_strip[7]*in_strip[3];
      out_tile[24] += filts_strip[0]*in_strip[4];
      out_tile[25] += filts_strip[1]*in_strip[4];
      out_tile[26] += filts_strip[2]*in_strip[4];
      out_tile[27] += filts_strip[3]*in_strip[4];
      out_tile[28] += filts_strip[4]*in_strip[4];
      out_tile[29] += filts_strip[5]*in_strip[4];
      out_tile[30] += filts_strip[6]*in_strip[4];
      out_tile[31] += filts_strip[7]*in_strip[4];
      out_tile[32] += filts_strip[0]*in_strip[5];
      out_tile[33] += filts_strip[1]*in_strip[5];
      out_tile[34] += filts_strip[2]*in_strip[5];
      out_tile[35] += filts_strip[3]*in_strip[5];
      out_tile[36] += filts_strip[4]*in_strip[5];
      out_tile[37] += filts_strip[5]*in_strip[5];
      out_tile[38] += filts_strip[6]*in_strip[5];
      out_tile[39] += filts_strip[7]*in_strip[5];
      out_tile[40] += filts_strip[0]*in_strip[6];
      out_tile[41] += filts_strip[1]*in_strip[6];
      out_tile[42] += filts_strip[2]*in_strip[6];
      out_tile[43] += filts_strip[3]*in_strip[6];
      out_tile[44] += filts_strip[4]*in_strip[6];
      out_tile[45] += filts_strip[5]*in_strip[6];
      out_tile[46] += filts_strip[6]*in_strip[6];
      out_tile[47] += filts_strip[7]*in_strip[6];
      out_tile[48] += filts_strip[0]*in_strip[7];
      out_tile[49] += filts_strip[1]*in_strip[7];
      out_tile[50] += filts_strip[2]*in_strip[7];
      out_tile[51] += filts_strip[3]*in_strip[7];
      out_tile[52] += filts_strip[4]*in_strip[7];
      out_tile[53] += filts_strip[5]*in_strip[7];
      out_tile[54] += filts_strip[6]*in_strip[7];
      out_tile[55] += filts_strip[7]*in_strip[7];
      out_tile[56] += filts_strip[0]*in_strip[8];
      out_tile[57] += filts_strip[1]*in_strip[8];
      out_tile[58] += filts_strip[2]*in_strip[8];
      out_tile[59] += filts_strip[3]*in_strip[8];
      out_tile[60] += filts_strip[4]*in_strip[8];
      out_tile[61] += filts_strip[5]*in_strip[8];
      out_tile[62] += filts_strip[6]*in_strip[8];
      out_tile[63] += filts_strip[7]*in_strip[8];
      // ---- kx == 2 : out_tile[x*8+r] += filt[kx][r] * in_strip[x+2]
      filts_strip[0] = filts_smem_off[2*128+0*16];
      filts_strip[1] = filts_smem_off[2*128+1*16];
      filts_strip[2] = filts_smem_off[2*128+2*16];
      filts_strip[3] = filts_smem_off[2*128+3*16];
      filts_strip[4] = filts_smem_off[2*128+4*16];
      filts_strip[5] = filts_smem_off[2*128+5*16];
      filts_strip[6] = filts_smem_off[2*128+6*16];
      filts_strip[7] = filts_smem_off[2*128+7*16];
      out_tile[0] += filts_strip[0]*in_strip[2];
      out_tile[1] += filts_strip[1]*in_strip[2];
      out_tile[2] += filts_strip[2]*in_strip[2];
      out_tile[3] += filts_strip[3]*in_strip[2];
      out_tile[4] += filts_strip[4]*in_strip[2];
      out_tile[5] += filts_strip[5]*in_strip[2];
      out_tile[6] += filts_strip[6]*in_strip[2];
      out_tile[7] += filts_strip[7]*in_strip[2];
      out_tile[8] += filts_strip[0]*in_strip[3];
      out_tile[9] += filts_strip[1]*in_strip[3];
      out_tile[10] += filts_strip[2]*in_strip[3];
      out_tile[11] += filts_strip[3]*in_strip[3];
      out_tile[12] += filts_strip[4]*in_strip[3];
      out_tile[13] += filts_strip[5]*in_strip[3];
      out_tile[14] += filts_strip[6]*in_strip[3];
      out_tile[15] += filts_strip[7]*in_strip[3];
      out_tile[16] += filts_strip[0]*in_strip[4];
      out_tile[17] += filts_strip[1]*in_strip[4];
      out_tile[18] += filts_strip[2]*in_strip[4];
      out_tile[19] += filts_strip[3]*in_strip[4];
      out_tile[20] += filts_strip[4]*in_strip[4];
      out_tile[21] += filts_strip[5]*in_strip[4];
      out_tile[22] += filts_strip[6]*in_strip[4];
      out_tile[23] += filts_strip[7]*in_strip[4];
      out_tile[24] += filts_strip[0]*in_strip[5];
      out_tile[25] += filts_strip[1]*in_strip[5];
      out_tile[26] += filts_strip[2]*in_strip[5];
      out_tile[27] += filts_strip[3]*in_strip[5];
      out_tile[28] += filts_strip[4]*in_strip[5];
      out_tile[29] += filts_strip[5]*in_strip[5];
      out_tile[30] += filts_strip[6]*in_strip[5];
      out_tile[31] += filts_strip[7]*in_strip[5];
      out_tile[32] += filts_strip[0]*in_strip[6];
      out_tile[33] += filts_strip[1]*in_strip[6];
      out_tile[34] += filts_strip[2]*in_strip[6];
      out_tile[35] += filts_strip[3]*in_strip[6];
      out_tile[36] += filts_strip[4]*in_strip[6];
      out_tile[37] += filts_strip[5]*in_strip[6];
      out_tile[38] += filts_strip[6]*in_strip[6];
      out_tile[39] += filts_strip[7]*in_strip[6];
      out_tile[40] += filts_strip[0]*in_strip[7];
      out_tile[41] += filts_strip[1]*in_strip[7];
      out_tile[42] += filts_strip[2]*in_strip[7];
      out_tile[43] += filts_strip[3]*in_strip[7];
      out_tile[44] += filts_strip[4]*in_strip[7];
      out_tile[45] += filts_strip[5]*in_strip[7];
      out_tile[46] += filts_strip[6]*in_strip[7];
      out_tile[47] += filts_strip[7]*in_strip[7];
      out_tile[48] += filts_strip[0]*in_strip[8];
      out_tile[49] += filts_strip[1]*in_strip[8];
      out_tile[50] += filts_strip[2]*in_strip[8];
      out_tile[51] += filts_strip[3]*in_strip[8];
      out_tile[52] += filts_strip[4]*in_strip[8];
      out_tile[53] += filts_strip[5]*in_strip[8];
      out_tile[54] += filts_strip[6]*in_strip[8];
      out_tile[55] += filts_strip[7]*in_strip[8];
      out_tile[56] += filts_strip[0]*in_strip[9];
      out_tile[57] += filts_strip[1]*in_strip[9];
      out_tile[58] += filts_strip[2]*in_strip[9];
      out_tile[59] += filts_strip[3]*in_strip[9];
      out_tile[60] += filts_strip[4]*in_strip[9];
      out_tile[61] += filts_strip[5]*in_strip[9];
      out_tile[62] += filts_strip[6]*in_strip[9];
      out_tile[63] += filts_strip[7]*in_strip[9];
      // end inner_loop_body
    }
  }
  if( flags == 2 ) { return; } // same value for all threads, so no barrier is left half-reached
  BARRIER_SYNC; // all threads are done with the last filters before filts_smem is reused for biases
  // stage this block's 128 biases in filts_smem at [chan_reg*16 + chan_tile], matching filts_smem_off
  for( int32_t i = 0; i != 1; ++i ) {
    int32_t const t_smem_bias_ix = LOC_ID_1D+128*i;
    if( t_smem_bias_ix < 128 ) {
      int32_t const ocix_base = (GRP_ID_1D%2)*128;
      int32_t const load_reg = t_smem_bias_ix / 16;
      int32_t const load_tile = t_smem_bias_ix % 16;
      int32_t const ocix = ocix_base + load_tile*8 + load_reg;
      // channels >= 192 do not exist; their smem slots keep stale filter data, which is never stored (see guards below)
      if( ocix < 192 ) { filts_smem[t_smem_bias_ix] = biases[ ocix ]; }
    }
  }
  BARRIER_SYNC;
  // begin t_tile_bias_loads (filts_strip is reused to hold the 8 biases of this thread)
  filts_strip[0] = filts_smem_off[0*16];
  filts_strip[1] = filts_smem_off[1*16];
  filts_strip[2] = filts_smem_off[2*16];
  filts_strip[3] = filts_smem_off[3*16];
  filts_strip[4] = filts_smem_off[4*16];
  filts_strip[5] = filts_smem_off[5*16];
  filts_strip[6] = filts_smem_off[6*16];
  filts_strip[7] = filts_smem_off[7*16];
  // end t_tile_bias_loads;
  if( flags == 1 ) { return; }
  // begin t_tile_stores: out = max(0, acc + bias), i.e. bias add followed by ReLU
  if( (out_line/28) >= 20 ) { return; }
  int32_t out_x = ((GRP_ID_1D/2)%4)*8;
  int32_t out_chan = ((GRP_ID_1D%2)*16 + (LOC_ID_1D%16))*8;
  GASQ float * out_off = out + (out_line/28)*150528 + out_chan*784 + (out_line%28)*28 + out_x*1 ;
  if( (out_x + 0) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 0*1 ] = max(0.0f,out_tile[0] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 0*1 ] = max(0.0f,out_tile[1] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 0*1 ] = max(0.0f,out_tile[2] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 0*1 ] = max(0.0f,out_tile[3] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 0*1 ] = max(0.0f,out_tile[4] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 0*1 ] = max(0.0f,out_tile[5] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 0*1 ] = max(0.0f,out_tile[6] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 0*1 ] = max(0.0f,out_tile[7] + filts_strip[7]); }
  if( (out_x + 1) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 1*1 ] = max(0.0f,out_tile[8] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 1*1 ] = max(0.0f,out_tile[9] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 1*1 ] = max(0.0f,out_tile[10] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 1*1 ] = max(0.0f,out_tile[11] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 1*1 ] = max(0.0f,out_tile[12] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 1*1 ] = max(0.0f,out_tile[13] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 1*1 ] = max(0.0f,out_tile[14] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 1*1 ] = max(0.0f,out_tile[15] + filts_strip[7]); }
  if( (out_x + 2) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 2*1 ] = max(0.0f,out_tile[16] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 2*1 ] = max(0.0f,out_tile[17] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 2*1 ] = max(0.0f,out_tile[18] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 2*1 ] = max(0.0f,out_tile[19] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 2*1 ] = max(0.0f,out_tile[20] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 2*1 ] = max(0.0f,out_tile[21] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 2*1 ] = max(0.0f,out_tile[22] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 2*1 ] = max(0.0f,out_tile[23] + filts_strip[7]); }
  if( (out_x + 3) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 3*1 ] = max(0.0f,out_tile[24] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 3*1 ] = max(0.0f,out_tile[25] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 3*1 ] = max(0.0f,out_tile[26] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 3*1 ] = max(0.0f,out_tile[27] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 3*1 ] = max(0.0f,out_tile[28] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 3*1 ] = max(0.0f,out_tile[29] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 3*1 ] = max(0.0f,out_tile[30] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 3*1 ] = max(0.0f,out_tile[31] + filts_strip[7]); }
  if( (out_x + 4) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 4*1 ] = max(0.0f,out_tile[32] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 4*1 ] = max(0.0f,out_tile[33] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 4*1 ] = max(0.0f,out_tile[34] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 4*1 ] = max(0.0f,out_tile[35] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 4*1 ] = max(0.0f,out_tile[36] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 4*1 ] = max(0.0f,out_tile[37] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 4*1 ] = max(0.0f,out_tile[38] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 4*1 ] = max(0.0f,out_tile[39] + filts_strip[7]); }
  if( (out_x + 5) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 5*1 ] = max(0.0f,out_tile[40] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 5*1 ] = max(0.0f,out_tile[41] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 5*1 ] = max(0.0f,out_tile[42] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 5*1 ] = max(0.0f,out_tile[43] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 5*1 ] = max(0.0f,out_tile[44] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 5*1 ] = max(0.0f,out_tile[45] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 5*1 ] = max(0.0f,out_tile[46] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 5*1 ] = max(0.0f,out_tile[47] + filts_strip[7]); }
  if( (out_x + 6) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 6*1 ] = max(0.0f,out_tile[48] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 6*1 ] = max(0.0f,out_tile[49] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 6*1 ] = max(0.0f,out_tile[50] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 6*1 ] = max(0.0f,out_tile[51] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 6*1 ] = max(0.0f,out_tile[52] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 6*1 ] = max(0.0f,out_tile[53] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 6*1 ] = max(0.0f,out_tile[54] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 6*1 ] = max(0.0f,out_tile[55] + filts_strip[7]); }
  if( (out_x + 7) >= 28 ) { return; } // this x value and the following are off-the-end patches, so don't store them.
  if( (out_chan + 0) < 192 ) { out_off[ 0*784 + 7*1 ] = max(0.0f,out_tile[56] + filts_strip[0]); }
  if( (out_chan + 1) < 192 ) { out_off[ 1*784 + 7*1 ] = max(0.0f,out_tile[57] + filts_strip[1]); }
  if( (out_chan + 2) < 192 ) { out_off[ 2*784 + 7*1 ] = max(0.0f,out_tile[58] + filts_strip[2]); }
  if( (out_chan + 3) < 192 ) { out_off[ 3*784 + 7*1 ] = max(0.0f,out_tile[59] + filts_strip[3]); }
  if( (out_chan + 4) < 192 ) { out_off[ 4*784 + 7*1 ] = max(0.0f,out_tile[60] + filts_strip[4]); }
  if( (out_chan + 5) < 192 ) { out_off[ 5*784 + 7*1 ] = max(0.0f,out_tile[61] + filts_strip[5]); }
  if( (out_chan + 6) < 192 ) { out_off[ 6*784 + 7*1 ] = max(0.0f,out_tile[62] + filts_strip[6]); }
  if( (out_chan + 7) < 192 ) { out_off[ 7*784 + 7*1 ] = max(0.0f,out_tile[63] + filts_strip[7]); }
  // end t_tile_stores;
}
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* kern_sz = 3 */ | |
/* stride = 1 */ | |
/* in_pad = 1 */ | |
/* t_tile_sz = 8 */ | |
/* conv_has_relu = 1 */ | |
/* out_chans = 192 */ | |
/* in_chans = 128 */ | |
/* rtc_func_name = tconv__num_imgs_20__in_dim_0_28__in_dim_1_28__kern_sz_3__stride_1__in_pad_1__t_tile_sz_8__conv_has_relu_1__out_chans_192__in_chans_128 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 192 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%192) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 150528 */ | |
/* out_ix_img_nomod = (out_ix/150528) */ | |
/* out_ix_img = (out_ix/150528) */ | |
/* out_ix_sz = 3010560 */ | |
/* tpb = 128 */ | |
/* out_line_y_dim = 28 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%28) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 28 */ | |
/* out_line_img_nomod = (out_line/28) */ | |
/* out_line_img = (out_line/28) */ | |
/* out_line_sz = 560 */ | |
/* in_ix_blk_x_dim = 10 */ | |
/* in_ix_blk_x_sz = 1 */ | |
/* in_ix_blk_x_nomod = in_ix */ | |
/* in_ix_blk_x = (in_ix%%10) */ | |
/* in_ix_blk_y_dim = 12 */ | |
/* in_ix_blk_y_sz = 10 */ | |
/* in_ix_blk_y_nomod = (in_ix/10) */ | |
/* in_ix_blk_y = ((in_ix/10)%%12) */ | |
/* in_ix_blk_in_chan_dim = 128 */ | |
/* in_ix_blk_in_chan_sz = 120 */ | |
/* in_ix_blk_in_chan_nomod = (in_ix/120) */ | |
/* in_ix_blk_in_chan = ((in_ix/120)%%128) */ | |
/* in_ix_blk_bx_dim = 4 */ | |
/* in_ix_blk_bx_sz = 15360 */ | |
/* in_ix_blk_bx_nomod = (in_ix/15360) */ | |
/* in_ix_blk_bx = ((in_ix/15360)%%4) */ | |
/* in_ix_blk_bline_dim = 70 */ | |
/* in_ix_blk_bline_sz = 61440 */ | |
/* in_ix_blk_bline_nomod = (in_ix/61440) */ | |
/* in_ix_blk_bline = (in_ix/61440) */ | |
/* in_ix_sz = 4300800 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 16 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%16) */ | |
/* LOC_ID_1D_blk_y_dim = 8 */ | |
/* LOC_ID_1D_blk_y_sz = 16 */ | |
/* LOC_ID_1D_blk_y_nomod = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_blk_y = (LOC_ID_1D/16) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 2 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%2) */ | |
/* GRP_ID_1D_blk_bx_dim = 4 */ | |
/* GRP_ID_1D_blk_bx_sz = 2 */ | |
/* GRP_ID_1D_blk_bx_nomod = (GRP_ID_1D/2) */ | |
/* GRP_ID_1D_blk_bx = ((GRP_ID_1D/2)%%4) */ | |
/* GRP_ID_1D_blk_bline_dim = 70 */ | |
/* GRP_ID_1D_blk_bline_sz = 8 */ | |
/* GRP_ID_1D_blk_bline_nomod = (GRP_ID_1D/8) */ | |
/* GRP_ID_1D_blk_bline = (GRP_ID_1D/8) */ | |
/* GRP_ID_1D_sz = 560 */ | |
/* blk_filt_ix_sz = 128 */ | |
/* filts_smem_sz = 384 */ | |
/* in_smem_sz = 120 */ | |
/* out_smem_sz = 1024 */ | |
/* all_smem_sz = 1024 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 3 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */ | |
/* filts_xp_ix_y_dim = 3 */ | |
/* filts_xp_ix_y_sz = 384 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */ | |
/* filts_xp_ix_in_chan_dim = 128 */ | |
/* filts_xp_ix_in_chan_sz = 1152 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%128) */ | |
/* filts_xp_ix_out_chan_blk_dim = 2 */ | |
/* filts_xp_ix_out_chan_blk_sz = 147456 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/147456) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/147456) */ | |
/* filts_xp_ix_sz = 294912 */ | |
/* out_chan_bias_smem_load_iter = 1 */ | |
/* filts_off_adj = LOC_ID_1D */ | |
/* filt_smem_loads = // begin filt_smem_loads | |
filts_smem[(LOC_ID_1D + %(tpb) * 0)] = filts[filts_off+(%(tpb)*0)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 1)] = filts[filts_off+(%(tpb)*1)]; | |
filts_smem[(LOC_ID_1D + %(tpb) * 2)] = filts[filts_off+(%(tpb)*2)]; | |
filts_off += %(filts_xp_ix_y_sz); | |
// end filt_smem_loads */ | |
/* in_smem_loads = // begin in_smem_loads | |
if( (LOC_ID_1D + %(tpb) * 0) < %(in_smem_sz)) { in_smem[(LOC_ID_1D + %(tpb) * 0)] = in[ blk_in_ix_base + (%(tpb)*0) ];} | |
blk_in_ix_base += %(in_ix_blk_in_chan_sz); | |
// end in_smem_loads */ | |
/* inner_loop_body = // begin inner_loop_body | |
in_strip[0] = in_smem_off[0]; | |
in_strip[1] = in_smem_off[1]; | |
in_strip[2] = in_smem_off[2]; | |
in_strip[3] = in_smem_off[3]; | |
in_strip[4] = in_smem_off[4]; | |
in_strip[5] = in_smem_off[5]; | |
in_strip[6] = in_smem_off[6]; | |
in_strip[7] = in_smem_off[7]; | |
in_strip[8] = in_smem_off[8]; | |
in_strip[9] = in_smem_off[9]; | |
filts_strip[0] = filts_smem_off[0*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[0*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[0*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[0*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[0*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[0*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[0*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[0*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
filts_strip[0] = filts_smem_off[1*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[1*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[1*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[1*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[1*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[1*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[1*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[1]; | |
out_tile[1] += filts_strip[1]*in_strip[1]; | |
out_tile[2] += filts_strip[2]*in_strip[1]; | |
out_tile[3] += filts_strip[3]*in_strip[1]; | |
out_tile[4] += filts_strip[4]*in_strip[1]; | |
out_tile[5] += filts_strip[5]*in_strip[1]; | |
out_tile[6] += filts_strip[6]*in_strip[1]; | |
out_tile[7] += filts_strip[7]*in_strip[1]; | |
out_tile[8] += filts_strip[0]*in_strip[2]; | |
out_tile[9] += filts_strip[1]*in_strip[2]; | |
out_tile[10] += filts_strip[2]*in_strip[2]; | |
out_tile[11] += filts_strip[3]*in_strip[2]; | |
out_tile[12] += filts_strip[4]*in_strip[2]; | |
out_tile[13] += filts_strip[5]*in_strip[2]; | |
out_tile[14] += filts_strip[6]*in_strip[2]; | |
out_tile[15] += filts_strip[7]*in_strip[2]; | |
out_tile[16] += filts_strip[0]*in_strip[3]; | |
out_tile[17] += filts_strip[1]*in_strip[3]; | |
out_tile[18] += filts_strip[2]*in_strip[3]; | |
out_tile[19] += filts_strip[3]*in_strip[3]; | |
out_tile[20] += filts_strip[4]*in_strip[3]; | |
out_tile[21] += filts_strip[5]*in_strip[3]; | |
out_tile[22] += filts_strip[6]*in_strip[3]; | |
out_tile[23] += filts_strip[7]*in_strip[3]; | |
out_tile[24] += filts_strip[0]*in_strip[4]; | |
out_tile[25] += filts_strip[1]*in_strip[4]; | |
out_tile[26] += filts_strip[2]*in_strip[4]; | |
out_tile[27] += filts_strip[3]*in_strip[4]; | |
out_tile[28] += filts_strip[4]*in_strip[4]; | |
out_tile[29] += filts_strip[5]*in_strip[4]; | |
out_tile[30] += filts_strip[6]*in_strip[4]; | |
out_tile[31] += filts_strip[7]*in_strip[4]; | |
out_tile[32] += filts_strip[0]*in_strip[5]; | |
out_tile[33] += filts_strip[1]*in_strip[5]; | |
out_tile[34] += filts_strip[2]*in_strip[5]; | |
out_tile[35] += filts_strip[3]*in_strip[5]; | |
out_tile[36] += filts_strip[4]*in_strip[5]; | |
out_tile[37] += filts_strip[5]*in_strip[5]; | |
out_tile[38] += filts_strip[6]*in_strip[5]; | |
out_tile[39] += filts_strip[7]*in_strip[5]; | |
out_tile[40] += filts_strip[0]*in_strip[6]; | |
out_tile[41] += filts_strip[1]*in_strip[6]; | |
out_tile[42] += filts_strip[2]*in_strip[6]; | |
out_tile[43] += filts_strip[3]*in_strip[6]; | |
out_tile[44] += filts_strip[4]*in_strip[6]; | |
out_tile[45] += filts_strip[5]*in_strip[6]; | |
out_tile[46] += filts_strip[6]*in_strip[6]; | |
out_tile[47] += filts_strip[7]*in_strip[6]; | |
out_tile[48] += filts_strip[0]*in_strip[7]; | |
out_tile[49] += filts_strip[1]*in_strip[7]; | |
out_tile[50] += filts_strip[2]*in_strip[7]; | |
out_tile[51] += filts_strip[3]*in_strip[7]; | |
out_tile[52] += filts_strip[4]*in_strip[7]; | |
out_tile[53] += filts_strip[5]*in_strip[7]; | |
out_tile[54] += filts_strip[6]*in_strip[7]; | |
out_tile[55] += filts_strip[7]*in_strip[7]; | |
out_tile[56] += filts_strip[0]*in_strip[8]; | |
out_tile[57] += filts_strip[1]*in_strip[8]; | |
out_tile[58] += filts_strip[2]*in_strip[8]; | |
out_tile[59] += filts_strip[3]*in_strip[8]; | |
out_tile[60] += filts_strip[4]*in_strip[8]; | |
out_tile[61] += filts_strip[5]*in_strip[8]; | |
out_tile[62] += filts_strip[6]*in_strip[8]; | |
out_tile[63] += filts_strip[7]*in_strip[8]; | |
filts_strip[0] = filts_smem_off[2*%(blk_filt_ix_sz)+0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[2*%(blk_filt_ix_sz)+1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(blk_filt_ix_sz)+2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[2*%(blk_filt_ix_sz)+3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[2*%(blk_filt_ix_sz)+4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[2*%(blk_filt_ix_sz)+5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[2*%(blk_filt_ix_sz)+6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[2*%(blk_filt_ix_sz)+7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
out_tile[0] += filts_strip[0]*in_strip[2]; | |
out_tile[1] += filts_strip[1]*in_strip[2]; | |
out_tile[2] += filts_strip[2]*in_strip[2]; | |
out_tile[3] += filts_strip[3]*in_strip[2]; | |
out_tile[4] += filts_strip[4]*in_strip[2]; | |
out_tile[5] += filts_strip[5]*in_strip[2]; | |
out_tile[6] += filts_strip[6]*in_strip[2]; | |
out_tile[7] += filts_strip[7]*in_strip[2]; | |
out_tile[8] += filts_strip[0]*in_strip[3]; | |
out_tile[9] += filts_strip[1]*in_strip[3]; | |
out_tile[10] += filts_strip[2]*in_strip[3]; | |
out_tile[11] += filts_strip[3]*in_strip[3]; | |
out_tile[12] += filts_strip[4]*in_strip[3]; | |
out_tile[13] += filts_strip[5]*in_strip[3]; | |
out_tile[14] += filts_strip[6]*in_strip[3]; | |
out_tile[15] += filts_strip[7]*in_strip[3]; | |
out_tile[16] += filts_strip[0]*in_strip[4]; | |
out_tile[17] += filts_strip[1]*in_strip[4]; | |
out_tile[18] += filts_strip[2]*in_strip[4]; | |
out_tile[19] += filts_strip[3]*in_strip[4]; | |
out_tile[20] += filts_strip[4]*in_strip[4]; | |
out_tile[21] += filts_strip[5]*in_strip[4]; | |
out_tile[22] += filts_strip[6]*in_strip[4]; | |
out_tile[23] += filts_strip[7]*in_strip[4]; | |
out_tile[24] += filts_strip[0]*in_strip[5]; | |
out_tile[25] += filts_strip[1]*in_strip[5]; | |
out_tile[26] += filts_strip[2]*in_strip[5]; | |
out_tile[27] += filts_strip[3]*in_strip[5]; | |
out_tile[28] += filts_strip[4]*in_strip[5]; | |
out_tile[29] += filts_strip[5]*in_strip[5]; | |
out_tile[30] += filts_strip[6]*in_strip[5]; | |
out_tile[31] += filts_strip[7]*in_strip[5]; | |
out_tile[32] += filts_strip[0]*in_strip[6]; | |
out_tile[33] += filts_strip[1]*in_strip[6]; | |
out_tile[34] += filts_strip[2]*in_strip[6]; | |
out_tile[35] += filts_strip[3]*in_strip[6]; | |
out_tile[36] += filts_strip[4]*in_strip[6]; | |
out_tile[37] += filts_strip[5]*in_strip[6]; | |
out_tile[38] += filts_strip[6]*in_strip[6]; | |
out_tile[39] += filts_strip[7]*in_strip[6]; | |
out_tile[40] += filts_strip[0]*in_strip[7]; | |
out_tile[41] += filts_strip[1]*in_strip[7]; | |
out_tile[42] += filts_strip[2]*in_strip[7]; | |
out_tile[43] += filts_strip[3]*in_strip[7]; | |
out_tile[44] += filts_strip[4]*in_strip[7]; | |
out_tile[45] += filts_strip[5]*in_strip[7]; | |
out_tile[46] += filts_strip[6]*in_strip[7]; | |
out_tile[47] += filts_strip[7]*in_strip[7]; | |
out_tile[48] += filts_strip[0]*in_strip[8]; | |
out_tile[49] += filts_strip[1]*in_strip[8]; | |
out_tile[50] += filts_strip[2]*in_strip[8]; | |
out_tile[51] += filts_strip[3]*in_strip[8]; | |
out_tile[52] += filts_strip[4]*in_strip[8]; | |
out_tile[53] += filts_strip[5]*in_strip[8]; | |
out_tile[54] += filts_strip[6]*in_strip[8]; | |
out_tile[55] += filts_strip[7]*in_strip[8]; | |
out_tile[56] += filts_strip[0]*in_strip[9]; | |
out_tile[57] += filts_strip[1]*in_strip[9]; | |
out_tile[58] += filts_strip[2]*in_strip[9]; | |
out_tile[59] += filts_strip[3]*in_strip[9]; | |
out_tile[60] += filts_strip[4]*in_strip[9]; | |
out_tile[61] += filts_strip[5]*in_strip[9]; | |
out_tile[62] += filts_strip[6]*in_strip[9]; | |
out_tile[63] += filts_strip[7]*in_strip[9]; | |
*/ | |
/* t_tile_bias_loads = // begin t_tile_bias_loads | |
filts_strip[0] = filts_smem_off[0*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[1] = filts_smem_off[1*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[2] = filts_smem_off[2*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[3] = filts_smem_off[3*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[4] = filts_smem_off[4*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[5] = filts_smem_off[5*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[6] = filts_smem_off[6*%(LOC_ID_1D_out_chan_tile_dim)]; | |
filts_strip[7] = filts_smem_off[7*%(LOC_ID_1D_out_chan_tile_dim)]; | |
// end t_tile_bias_loads */ | |
/* t_tile_stores = // begin t_tile_stores | |
if( %(out_line_img) >= %(out_ix_img_dim) ) { return; } | |
int32_t out_x = %(GRP_ID_1D_blk_bx)*%(t_tile_sz); | |
int32_t out_chan = (%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim) + %(LOC_ID_1D_out_chan_tile))*%(t_tile_sz); | |
GASQ float * out_off = out + %(out_line_img)*%(out_ix_img_sz) + out_chan*%(out_ix_chan_sz) + %(out_line_y)*%(out_ix_y_sz) + out_x*%(out_ix_x_sz) ; | |
if( (out_x + 0) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 0*%(out_ix_x_sz) ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (out_x + 1) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 1*%(out_ix_x_sz) ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (out_x + 2) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 2*%(out_ix_x_sz) ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (out_x + 3) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 3*%(out_ix_x_sz) ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (out_x + 4) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 4*%(out_ix_x_sz) ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (out_x + 5) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 5*%(out_ix_x_sz) ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (out_x + 6) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 6*%(out_ix_x_sz) ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (out_x + 7) >= %(out_ix_x_dim) ) { return; } // this x value and the following are off-the-end patches, so don't store them. | |
if( (out_chan + 0) < %(out_ix_chan_dim) ) { out_off[ 0*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( (out_chan + 1) < %(out_ix_chan_dim) ) { out_off[ 1*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( (out_chan + 2) < %(out_ix_chan_dim) ) { out_off[ 2*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( (out_chan + 3) < %(out_ix_chan_dim) ) { out_off[ 3*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( (out_chan + 4) < %(out_ix_chan_dim) ) { out_off[ 4*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( (out_chan + 5) < %(out_ix_chan_dim) ) { out_off[ 5*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( (out_chan + 6) < %(out_ix_chan_dim) ) { out_off[ 6*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( (out_chan + 7) < %(out_ix_chan_dim) ) { out_off[ 7*%(out_ix_chan_sz) + 7*%(out_ix_x_sz) ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores */ | |
// Copies elements of `in` (indexed img:chan:y:x with strides 100352:784:28:1, see the in_ix_*
// entries of the substitution table that follows) into the blocked layout of `out`
// (blk_bline:blk_bx:blk_in_chan:blk_y:blk_x with dims 70:4:128:12:10, see the out_ix_* entries).
// One work-item produces exactly one element of `out`. An element whose source coordinate
// falls outside the 28x28 image, or whose image index is >= 20, is written as 0.0f.
CUCL_GLOBAL_KERNEL void in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_128__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280( GASQ float const * const in, GASQ float * const out ) {
  int32_t const out_ix = GLOB_ID_1D;
  if( out_ix >= 4300800 ) { return; } // out_ix_sz: surplus work-items have nothing to write

  // split the flat output index into its blocked coordinates (innermost first)
  int32_t const blk_x       = out_ix % 10;
  int32_t const blk_y       = (out_ix / 10) % 12;
  int32_t const blk_in_chan = (out_ix / 120) % 128;
  int32_t const blk_bx      = (out_ix / 15360) % 4;
  int32_t const blk_bline   = out_ix / 61440;

  // first output line of this bline; 28 output lines per image (out_line_img_sz)
  int32_t const out_line = blk_bline * 8;
  // input line counted from the top of the image that out_line starts in (stride is 1)
  int32_t const in_line = blk_y + (out_line % 28);
  // input lines per image as counted here: (28 - 1)*1 + 3
  int32_t const img_in_lines = 30;

  // in_line can run past img_in_lines, in which case the source is in the following image
  int32_t const img = (out_line / 28) + (in_line / img_in_lines);
  // the trailing -1 is the in_pad offset: coordinate -1 is the first padding row/column
  int32_t const iy = (in_line % img_in_lines) - 1;
  int32_t const ix = blk_bx * 8 + blk_x - 1;

  int const inside = ( ix >= 0 ) && ( ix < 28 ) && ( iy >= 0 ) && ( iy < 28 ) && ( img < 20 );
  // the load is only evaluated when the coordinate is valid; everything else becomes zero padding
  out[out_ix] = inside ? in[ img*100352 + blk_in_chan*784 + iy*28 + ix ] : 0.0f;
}
// -- template substitution table used: --
/* num_imgs = 20 */ | |
/* stride = 1 */ | |
/* kern_sz = 3 */ | |
/* in_pad = 1 */ | |
/* in_chans = 128 */ | |
/* ysz = 28 */ | |
/* xsz = 28 */ | |
/* tix_pels_tile_sz = 8 */ | |
/* t_tile_sz = 8 */ | |
/* bix_pels_blk_sz = 280 */ | |
/* rtc_func_name = in_tile_xpose__num_imgs_20__stride_1__kern_sz_3__in_pad_1__in_chans_128__ysz_28__xsz_28__tix_pels_tile_sz_8__t_tile_sz_8__bix_pels_blk_sz_280 */ | |
/* out_ix_blk_x_dim = 10 */ | |
/* out_ix_blk_x_sz = 1 */ | |
/* out_ix_blk_x_nomod = out_ix */ | |
/* out_ix_blk_x = (out_ix%%10) */ | |
/* out_ix_blk_y_dim = 12 */ | |
/* out_ix_blk_y_sz = 10 */ | |
/* out_ix_blk_y_nomod = (out_ix/10) */ | |
/* out_ix_blk_y = ((out_ix/10)%%12) */ | |
/* out_ix_blk_in_chan_dim = 128 */ | |
/* out_ix_blk_in_chan_sz = 120 */ | |
/* out_ix_blk_in_chan_nomod = (out_ix/120) */ | |
/* out_ix_blk_in_chan = ((out_ix/120)%%128) */ | |
/* out_ix_blk_bx_dim = 4 */ | |
/* out_ix_blk_bx_sz = 15360 */ | |
/* out_ix_blk_bx_nomod = (out_ix/15360) */ | |
/* out_ix_blk_bx = ((out_ix/15360)%%4) */ | |
/* out_ix_blk_bline_dim = 70 */ | |
/* out_ix_blk_bline_sz = 61440 */ | |
/* out_ix_blk_bline_nomod = (out_ix/61440) */ | |
/* out_ix_blk_bline = (out_ix/61440) */ | |
/* out_ix_sz = 4300800 */ | |
/* out_line_y_dim = 28 */ | |
/* out_line_y_sz = 1 */ | |
/* out_line_y_nomod = out_line */ | |
/* out_line_y = (out_line%%28) */ | |
/* out_line_img_dim = 20 */ | |
/* out_line_img_sz = 28 */ | |
/* out_line_img_nomod = (out_line/28) */ | |
/* out_line_img = (out_line/28) */ | |
/* out_line_sz = 560 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 128 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%128) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 100352 */ | |
/* in_ix_img_nomod = (in_ix/100352) */ | |
/* in_ix_img = (in_ix/100352) */ | |
/* in_ix_sz = 2007040 */ | |
// Re-lays-out a filter array: element filts_ix of `in` (out_chan:in_chan:y:x) is copied to
// position filts_xp_ix of `out` (out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile).
// One work-item moves one element. The strides used below are the filts_ix_* / filts_xp_ix_* /
// fioc_* entries of the substitution table that follows this kernel.
CUCL_GLOBAL_KERNEL void xpose_filts__out_chans_192__in_chans_128__kysz_3__kxsz_3( GASQ float const * const in, // CUCL IN out_chan:in_chan:y:x
                                                                                  GASQ float * const out ) // CUCL OUT out_chan_blk:in_chan:y:x:out_chan_reg:out_chan_tile
{
  // CUCL IX fioc out.out_chan_blk:out.out_chan_tile:out.out_chan_reg
  int32_t const filts_ix = GLOB_ID_1D;
  if( filts_ix >= 221184 ) { return; } // filts_ix_sz: surplus work-items have nothing to copy

  // source coordinates of this element
  int32_t const fioc    = filts_ix / 1152;       // out_chan
  int32_t const in_chan = (filts_ix / 9) % 128;
  int32_t const ky      = (filts_ix / 3) % 3;
  int32_t const kx      = filts_ix % 3;

  // the output channel is split into blk : tile : reg
  int32_t const oc_blk  = fioc / 128;
  int32_t const oc_tile = (fioc / 8) % 16;
  int32_t const oc_reg  = fioc % 8;

  // destination index, terms listed from outermost to innermost dimension
  int32_t const filts_xp_ix =
    oc_blk*147456 +
    in_chan*1152 +
    ky*384 +
    kx*128 +
    oc_reg*16 +
    oc_tile;

  float val = 0.0f;
#if 1
  val = in[filts_ix];
#else
  // debug pattern: for in_chan 0 only, store a value that encodes the kernel x/y position
  if( in_chan == 0 ) {
    // if( (kx == 5) && (ky == 5) )
    {
      val = kx*100 + ky;
    }
  }
#endif
  out[filts_xp_ix] = val;
}
// -- template substitution table used: --
/* out_chans = 192 */ | |
/* in_chans = 128 */ | |
/* kysz = 3 */ | |
/* kxsz = 3 */ | |
/* rtc_func_name = xpose_filts__out_chans_192__in_chans_128__kysz_3__kxsz_3 */ | |
/* t_tile_sz = 8 */ | |
/* filts_ix_x_dim = 3 */ | |
/* filts_ix_x_sz = 1 */ | |
/* filts_ix_x_nomod = filts_ix */ | |
/* filts_ix_x = (filts_ix%%3) */ | |
/* filts_ix_y_dim = 3 */ | |
/* filts_ix_y_sz = 3 */ | |
/* filts_ix_y_nomod = (filts_ix/3) */ | |
/* filts_ix_y = ((filts_ix/3)%%3) */ | |
/* filts_ix_in_chan_dim = 128 */ | |
/* filts_ix_in_chan_sz = 9 */ | |
/* filts_ix_in_chan_nomod = (filts_ix/9) */ | |
/* filts_ix_in_chan = ((filts_ix/9)%%128) */ | |
/* filts_ix_out_chan_dim = 192 */ | |
/* filts_ix_out_chan_sz = 1152 */ | |
/* filts_ix_out_chan_nomod = (filts_ix/1152) */ | |
/* filts_ix_out_chan = (filts_ix/1152) */ | |
/* filts_ix_sz = 221184 */ | |
/* filts_xp_ix_out_chan_tile_dim = 16 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%16) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 16 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/16) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/16)%%8) */ | |
/* filts_xp_ix_x_dim = 3 */ | |
/* filts_xp_ix_x_sz = 128 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/128) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/128)%%3) */ | |
/* filts_xp_ix_y_dim = 3 */ | |
/* filts_xp_ix_y_sz = 384 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/384) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/384)%%3) */ | |
/* filts_xp_ix_in_chan_dim = 128 */ | |
/* filts_xp_ix_in_chan_sz = 1152 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/1152) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/1152)%%128) */ | |
/* filts_xp_ix_out_chan_blk_dim = 2 */ | |
/* filts_xp_ix_out_chan_blk_sz = 147456 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/147456) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/147456) */ | |
/* filts_xp_ix_sz = 294912 */ | |
/* fioc_out_chan_reg_dim = 8 */ | |
/* fioc_out_chan_reg_sz = 1 */ | |
/* fioc_out_chan_reg_nomod = fioc */ | |
/* fioc_out_chan_reg = (fioc%%8) */ | |
/* fioc_out_chan_tile_dim = 16 */ | |
/* fioc_out_chan_tile_sz = 8 */ | |
/* fioc_out_chan_tile_nomod = (fioc/8) */ | |
/* fioc_out_chan_tile = ((fioc/8)%%16) */ | |
/* fioc_out_chan_blk_dim = 2 */ | |
/* fioc_out_chan_blk_sz = 128 */ | |
/* fioc_out_chan_blk_nomod = (fioc/128) */ | |
/* fioc_out_chan_blk = (fioc/128) */ | |
/* fioc_sz = 256 */ | |
// 256 tbp | |
// each thread: computes 8x8 block of out | |
// loop over k dim | |
CUCL_GLOBAL_KERNEL void conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_256( GASQ float const * const filts, GASQ float const * const biases, GASQ float const * const in, GASQ float * const out ) { | |
LOCSHAR_MEM float in_smem[32*8]; | |
int32_t const blk_filt_ix_sz = 4*8; | |
LOCSHAR_MEM float filts_smem[4*8]; // aka blk_filt_ix_sz, which wasn't const enough OpenCL | |
float out_tile[8*8] = {0}; // tile of output for this thread to compute, stored in registers | |
// reg. buffers for one strip each from in and filts of 8 elements, for the same filts_ix_out_chan_elem | |
float filts_strip[8]; // across output chans (stride is blk_filt_ix_sz ) | |
float in_strip[8]; // across patches (approx square block in x/y space, favoring x if sqrt() not integer) | |
int32_t const blk_filt_ix_base = (GRP_ID_1D%1)*8192; | |
int32_t const blk_patch_ix_sz = 32*8; | |
int32_t const blk_patch_ix_base = GRP_ID_1D*blk_patch_ix_sz; | |
// iteratate over filter elements | |
int32_t filts_off = blk_filt_ix_base; | |
for( int32_t filts_ix_out_chan_elem = 0; filts_ix_out_chan_elem != | |
(256 * 1 * 1); ++filts_ix_out_chan_elem ) { | |
BARRIER_SYNC; | |
if( LOC_ID_1D < blk_filt_ix_sz ) { | |
#ifdef NO_IOX // by default, we don't ever disable this, since it's seems about as good as it can be already | |
//filts_smem[LOC_ID_1D] = LOC_ID_1D; | |
filts_smem[LOC_ID_1D] = filts[LOC_ID_1D]; | |
#else | |
filts_smem[LOC_ID_1D] = filts[filts_off + LOC_ID_1D]; | |
#endif | |
} | |
for( int32_t i = 0; i != 2; ++i ) { | |
if( (LOC_ID_1D+LOC_SZ_1D*i) < blk_patch_ix_sz ) { | |
int32_t const t_smem_patch_ix = (blk_patch_ix_base+LOC_ID_1D+LOC_SZ_1D*i); | |
#ifdef NO_IO | |
//float v = LOC_ID_1D; | |
//float v = in[LOC_ID_1D]; | |
float v = in[filts_off + LOC_ID_1D]; | |
#else | |
float v = 0; | |
int const smem_in_ix_y = ((t_smem_patch_ix/28)%28)*1+(filts_ix_out_chan_elem%1) - 0; | |
int const smem_in_ix_x = (t_smem_patch_ix%28)*1+(filts_ix_out_chan_elem%1) - 0; | |
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 && | |
(t_smem_patch_ix/784) < 20 && | |
smem_in_ix_x < 28 && smem_in_ix_y < 28 ) { | |
v = in[(t_smem_patch_ix/784)*200704 + | |
filts_ix_out_chan_elem*784 + | |
smem_in_ix_y*28 + | |
smem_in_ix_x*1]; | |
}; | |
#endif | |
in_smem[LOC_ID_1D+LOC_SZ_1D*i] = v; | |
} | |
} | |
filts_off += 32; | |
BARRIER_SYNC; | |
#ifdef NO_IO | |
// begin t_tile_dummy_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D % 32) + 0]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D % 32) + 1]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D % 32) + 2]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D % 32) + 3]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D % 32) + 4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D % 32) + 5]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D % 32) + 6]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D % 32) + 7]; | |
in_strip[0] = in_smem[(LOC_ID_1D % 32) + 0]; | |
in_strip[1] = in_smem[(LOC_ID_1D % 32) + 1]; | |
in_strip[2] = in_smem[(LOC_ID_1D % 32) + 2]; | |
in_strip[3] = in_smem[(LOC_ID_1D % 32) + 3]; | |
in_strip[4] = in_smem[(LOC_ID_1D % 32) + 4]; | |
in_strip[5] = in_smem[(LOC_ID_1D % 32) + 5]; | |
in_strip[6] = in_smem[(LOC_ID_1D % 32) + 6]; | |
in_strip[7] = in_smem[(LOC_ID_1D % 32) + 7]; | |
// end t_tile_dummy_loads; | |
#else | |
// begin t_tile_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4]; | |
in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0]; | |
in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1]; | |
in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2]; | |
in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3]; | |
in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4]; | |
in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5]; | |
in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6]; | |
in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7]; | |
// end t_tile_loads; | |
#endif | |
// (2) do 8^2 fmas into out_tile | |
// begin t_tile_fmas | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] += filts_strip[1]*in_strip[3]; | |
out_tile[26] += filts_strip[2]*in_strip[3]; | |
out_tile[27] += filts_strip[3]*in_strip[3]; | |
out_tile[28] += filts_strip[4]*in_strip[3]; | |
out_tile[29] += filts_strip[5]*in_strip[3]; | |
out_tile[30] += filts_strip[6]*in_strip[3]; | |
out_tile[31] += filts_strip[7]*in_strip[3]; | |
out_tile[32] += filts_strip[0]*in_strip[4]; | |
out_tile[33] += filts_strip[1]*in_strip[4]; | |
out_tile[34] += filts_strip[2]*in_strip[4]; | |
out_tile[35] += filts_strip[3]*in_strip[4]; | |
out_tile[36] += filts_strip[4]*in_strip[4]; | |
out_tile[37] += filts_strip[5]*in_strip[4]; | |
out_tile[38] += filts_strip[6]*in_strip[4]; | |
out_tile[39] += filts_strip[7]*in_strip[4]; | |
out_tile[40] += filts_strip[0]*in_strip[5]; | |
out_tile[41] += filts_strip[1]*in_strip[5]; | |
out_tile[42] += filts_strip[2]*in_strip[5]; | |
out_tile[43] += filts_strip[3]*in_strip[5]; | |
out_tile[44] += filts_strip[4]*in_strip[5]; | |
out_tile[45] += filts_strip[5]*in_strip[5]; | |
out_tile[46] += filts_strip[6]*in_strip[5]; | |
out_tile[47] += filts_strip[7]*in_strip[5]; | |
out_tile[48] += filts_strip[0]*in_strip[6]; | |
out_tile[49] += filts_strip[1]*in_strip[6]; | |
out_tile[50] += filts_strip[2]*in_strip[6]; | |
out_tile[51] += filts_strip[3]*in_strip[6]; | |
out_tile[52] += filts_strip[4]*in_strip[6]; | |
out_tile[53] += filts_strip[5]*in_strip[6]; | |
out_tile[54] += filts_strip[6]*in_strip[6]; | |
out_tile[55] += filts_strip[7]*in_strip[6]; | |
out_tile[56] += filts_strip[0]*in_strip[7]; | |
out_tile[57] += filts_strip[1]*in_strip[7]; | |
out_tile[58] += filts_strip[2]*in_strip[7]; | |
out_tile[59] += filts_strip[3]*in_strip[7]; | |
out_tile[60] += filts_strip[4]*in_strip[7]; | |
out_tile[61] += filts_strip[5]*in_strip[7]; | |
out_tile[62] += filts_strip[6]*in_strip[7]; | |
out_tile[63] += filts_strip[7]*in_strip[7]; | |
// end t_tile_fmas; | |
} | |
// load per-block biases into smem | |
BARRIER_SYNC; | |
if( LOC_ID_1D < blk_filt_ix_sz ) { | |
int32_t const ocix_base = (GRP_ID_1D%1)*blk_filt_ix_sz; | |
int32_t const load_reg = LOC_ID_1D / 4; | |
int32_t const load_tile = LOC_ID_1D % 4; | |
int32_t const ocix = ocix_base + load_tile*8 + load_reg; | |
if( ocix < 32 ) { filts_smem[LOC_ID_1D] = biases[ ocix ]; } | |
//int32_t const ocix_tile = (ocix / 8) % 4; | |
//int32_t const ocix_reg = ocix % 8; | |
//filts_smem[ocix_tile * 1 + ocix_reg * 4] = biases[ocix]; | |
} | |
BARRIER_SYNC; | |
// load biases into filts_strip | |
// begin t_tile_loads | |
filts_strip[0] = filts_smem[(LOC_ID_1D%4)+0*4]; | |
filts_strip[1] = filts_smem[(LOC_ID_1D%4)+1*4]; | |
filts_strip[2] = filts_smem[(LOC_ID_1D%4)+2*4]; | |
filts_strip[3] = filts_smem[(LOC_ID_1D%4)+3*4]; | |
filts_strip[4] = filts_smem[(LOC_ID_1D%4)+4*4]; | |
filts_strip[5] = filts_smem[(LOC_ID_1D%4)+5*4]; | |
filts_strip[6] = filts_smem[(LOC_ID_1D%4)+6*4]; | |
filts_strip[7] = filts_smem[(LOC_ID_1D%4)+7*4]; | |
in_strip[0] = in_smem[8*(LOC_ID_1D/4)+0]; | |
in_strip[1] = in_smem[8*(LOC_ID_1D/4)+1]; | |
in_strip[2] = in_smem[8*(LOC_ID_1D/4)+2]; | |
in_strip[3] = in_smem[8*(LOC_ID_1D/4)+3]; | |
in_strip[4] = in_smem[8*(LOC_ID_1D/4)+4]; | |
in_strip[5] = in_smem[8*(LOC_ID_1D/4)+5]; | |
in_strip[6] = in_smem[8*(LOC_ID_1D/4)+6]; | |
in_strip[7] = in_smem[8*(LOC_ID_1D/4)+7]; | |
// end t_tile_loads; | |
// add bias to each elem of out_tile[] and store the results to out[] | |
#ifdef NO_IO | |
// begin t_tile_dummy_stores | |
out[0] = 0.0f | |
+ max(0.0f,out_tile[0] + filts_strip[0]) | |
+ max(0.0f,out_tile[1] + filts_strip[1]) | |
+ max(0.0f,out_tile[2] + filts_strip[2]) | |
+ max(0.0f,out_tile[3] + filts_strip[3]) | |
+ max(0.0f,out_tile[4] + filts_strip[4]) | |
+ max(0.0f,out_tile[5] + filts_strip[5]) | |
+ max(0.0f,out_tile[6] + filts_strip[6]) | |
+ max(0.0f,out_tile[7] + filts_strip[7]) | |
+ max(0.0f,out_tile[8] + filts_strip[0]) | |
+ max(0.0f,out_tile[9] + filts_strip[1]) | |
+ max(0.0f,out_tile[10] + filts_strip[2]) | |
+ max(0.0f,out_tile[11] + filts_strip[3]) | |
+ max(0.0f,out_tile[12] + filts_strip[4]) | |
+ max(0.0f,out_tile[13] + filts_strip[5]) | |
+ max(0.0f,out_tile[14] + filts_strip[6]) | |
+ max(0.0f,out_tile[15] + filts_strip[7]) | |
+ max(0.0f,out_tile[16] + filts_strip[0]) | |
+ max(0.0f,out_tile[17] + filts_strip[1]) | |
+ max(0.0f,out_tile[18] + filts_strip[2]) | |
+ max(0.0f,out_tile[19] + filts_strip[3]) | |
+ max(0.0f,out_tile[20] + filts_strip[4]) | |
+ max(0.0f,out_tile[21] + filts_strip[5]) | |
+ max(0.0f,out_tile[22] + filts_strip[6]) | |
+ max(0.0f,out_tile[23] + filts_strip[7]) | |
+ max(0.0f,out_tile[24] + filts_strip[0]) | |
+ max(0.0f,out_tile[25] + filts_strip[1]) | |
+ max(0.0f,out_tile[26] + filts_strip[2]) | |
+ max(0.0f,out_tile[27] + filts_strip[3]) | |
+ max(0.0f,out_tile[28] + filts_strip[4]) | |
+ max(0.0f,out_tile[29] + filts_strip[5]) | |
+ max(0.0f,out_tile[30] + filts_strip[6]) | |
+ max(0.0f,out_tile[31] + filts_strip[7]) | |
+ max(0.0f,out_tile[32] + filts_strip[0]) | |
+ max(0.0f,out_tile[33] + filts_strip[1]) | |
+ max(0.0f,out_tile[34] + filts_strip[2]) | |
+ max(0.0f,out_tile[35] + filts_strip[3]) | |
+ max(0.0f,out_tile[36] + filts_strip[4]) | |
+ max(0.0f,out_tile[37] + filts_strip[5]) | |
+ max(0.0f,out_tile[38] + filts_strip[6]) | |
+ max(0.0f,out_tile[39] + filts_strip[7]) | |
+ max(0.0f,out_tile[40] + filts_strip[0]) | |
+ max(0.0f,out_tile[41] + filts_strip[1]) | |
+ max(0.0f,out_tile[42] + filts_strip[2]) | |
+ max(0.0f,out_tile[43] + filts_strip[3]) | |
+ max(0.0f,out_tile[44] + filts_strip[4]) | |
+ max(0.0f,out_tile[45] + filts_strip[5]) | |
+ max(0.0f,out_tile[46] + filts_strip[6]) | |
+ max(0.0f,out_tile[47] + filts_strip[7]) | |
+ max(0.0f,out_tile[48] + filts_strip[0]) | |
+ max(0.0f,out_tile[49] + filts_strip[1]) | |
+ max(0.0f,out_tile[50] + filts_strip[2]) | |
+ max(0.0f,out_tile[51] + filts_strip[3]) | |
+ max(0.0f,out_tile[52] + filts_strip[4]) | |
+ max(0.0f,out_tile[53] + filts_strip[5]) | |
+ max(0.0f,out_tile[54] + filts_strip[6]) | |
+ max(0.0f,out_tile[55] + filts_strip[7]) | |
+ max(0.0f,out_tile[56] + filts_strip[0]) | |
+ max(0.0f,out_tile[57] + filts_strip[1]) | |
+ max(0.0f,out_tile[58] + filts_strip[2]) | |
+ max(0.0f,out_tile[59] + filts_strip[3]) | |
+ max(0.0f,out_tile[60] + filts_strip[4]) | |
+ max(0.0f,out_tile[61] + filts_strip[5]) | |
+ max(0.0f,out_tile[62] + filts_strip[6]) | |
+ max(0.0f,out_tile[63] + filts_strip[7]) | |
; | |
// end t_tile_dummy_stores; | |
#else | |
// begin t_tile_stores | |
int32_t tpix[8]; | |
int32_t tcix[8]; | |
tpix[0] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) % 784 ); // cache out patch ixs | |
tpix[1] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) % 784 ); // cache out patch ixs | |
tpix[2] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) % 784 ); // cache out patch ixs | |
tpix[3] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) % 784 ); // cache out patch ixs | |
tpix[4] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) % 784 ); // cache out patch ixs | |
tpix[5] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) % 784 ); // cache out patch ixs | |
tpix[6] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) % 784 ); // cache out patch ixs | |
tpix[7] = ((((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7)/784)*25088 + | |
( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) % 784 ); // cache out patch ixs | |
tcix[0] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+0)*784; // cache out chan ixs | |
tcix[1] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+1)*784; // cache out chan ixs | |
tcix[2] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+2)*784; // cache out chan ixs | |
tcix[3] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+3)*784; // cache out chan ixs | |
tcix[4] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+4)*784; // cache out chan ixs | |
tcix[5] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+5)*784; // cache out chan ixs | |
tcix[6] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+6)*784; // cache out chan ixs | |
tcix[7] = ((((LOC_ID_1D%4)+(GRP_ID_1D%1)*4)*8)+7)*784; // cache out chan ixs | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+0) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[0] + tcix[0] ] = max(0.0f,out_tile[0] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[0] + tcix[1] ] = max(0.0f,out_tile[1] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[0] + tcix[2] ] = max(0.0f,out_tile[2] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[0] + tcix[3] ] = max(0.0f,out_tile[3] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[0] + tcix[4] ] = max(0.0f,out_tile[4] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[0] + tcix[5] ] = max(0.0f,out_tile[5] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[0] + tcix[6] ] = max(0.0f,out_tile[6] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[0] + tcix[7] ] = max(0.0f,out_tile[7] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+1) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[1] + tcix[0] ] = max(0.0f,out_tile[8] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[1] + tcix[1] ] = max(0.0f,out_tile[9] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[1] + tcix[2] ] = max(0.0f,out_tile[10] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[1] + tcix[3] ] = max(0.0f,out_tile[11] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[1] + tcix[4] ] = max(0.0f,out_tile[12] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[1] + tcix[5] ] = max(0.0f,out_tile[13] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[1] + tcix[6] ] = max(0.0f,out_tile[14] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[1] + tcix[7] ] = max(0.0f,out_tile[15] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+2) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[2] + tcix[0] ] = max(0.0f,out_tile[16] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[2] + tcix[1] ] = max(0.0f,out_tile[17] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[2] + tcix[2] ] = max(0.0f,out_tile[18] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[2] + tcix[3] ] = max(0.0f,out_tile[19] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[2] + tcix[4] ] = max(0.0f,out_tile[20] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[2] + tcix[5] ] = max(0.0f,out_tile[21] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[2] + tcix[6] ] = max(0.0f,out_tile[22] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[2] + tcix[7] ] = max(0.0f,out_tile[23] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+3) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[3] + tcix[0] ] = max(0.0f,out_tile[24] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[3] + tcix[1] ] = max(0.0f,out_tile[25] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[3] + tcix[2] ] = max(0.0f,out_tile[26] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[3] + tcix[3] ] = max(0.0f,out_tile[27] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[3] + tcix[4] ] = max(0.0f,out_tile[28] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[3] + tcix[5] ] = max(0.0f,out_tile[29] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[3] + tcix[6] ] = max(0.0f,out_tile[30] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[3] + tcix[7] ] = max(0.0f,out_tile[31] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+4) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[4] + tcix[0] ] = max(0.0f,out_tile[32] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[4] + tcix[1] ] = max(0.0f,out_tile[33] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[4] + tcix[2] ] = max(0.0f,out_tile[34] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[4] + tcix[3] ] = max(0.0f,out_tile[35] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[4] + tcix[4] ] = max(0.0f,out_tile[36] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[4] + tcix[5] ] = max(0.0f,out_tile[37] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[4] + tcix[6] ] = max(0.0f,out_tile[38] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[4] + tcix[7] ] = max(0.0f,out_tile[39] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+5) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[5] + tcix[0] ] = max(0.0f,out_tile[40] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[5] + tcix[1] ] = max(0.0f,out_tile[41] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[5] + tcix[2] ] = max(0.0f,out_tile[42] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[5] + tcix[3] ] = max(0.0f,out_tile[43] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[5] + tcix[4] ] = max(0.0f,out_tile[44] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[5] + tcix[5] ] = max(0.0f,out_tile[45] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[5] + tcix[6] ] = max(0.0f,out_tile[46] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[5] + tcix[7] ] = max(0.0f,out_tile[47] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+6) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[6] + tcix[0] ] = max(0.0f,out_tile[48] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[6] + tcix[1] ] = max(0.0f,out_tile[49] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[6] + tcix[2] ] = max(0.0f,out_tile[50] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[6] + tcix[3] ] = max(0.0f,out_tile[51] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[6] + tcix[4] ] = max(0.0f,out_tile[52] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[6] + tcix[5] ] = max(0.0f,out_tile[53] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[6] + tcix[6] ] = max(0.0f,out_tile[54] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[6] + tcix[7] ] = max(0.0f,out_tile[55] + filts_strip[7]); } | |
if( (((LOC_ID_1D/4)+GRP_ID_1D*32)*8+7) >= 15680 ) { return; } // this patch and the following are off-the-end patches, so don't store them. | |
if( tcix[0] < (32*784) ) { out[ tpix[7] + tcix[0] ] = max(0.0f,out_tile[56] + filts_strip[0]); } | |
if( tcix[1] < (32*784) ) { out[ tpix[7] + tcix[1] ] = max(0.0f,out_tile[57] + filts_strip[1]); } | |
if( tcix[2] < (32*784) ) { out[ tpix[7] + tcix[2] ] = max(0.0f,out_tile[58] + filts_strip[2]); } | |
if( tcix[3] < (32*784) ) { out[ tpix[7] + tcix[3] ] = max(0.0f,out_tile[59] + filts_strip[3]); } | |
if( tcix[4] < (32*784) ) { out[ tpix[7] + tcix[4] ] = max(0.0f,out_tile[60] + filts_strip[4]); } | |
if( tcix[5] < (32*784) ) { out[ tpix[7] + tcix[5] ] = max(0.0f,out_tile[61] + filts_strip[5]); } | |
if( tcix[6] < (32*784) ) { out[ tpix[7] + tcix[6] ] = max(0.0f,out_tile[62] + filts_strip[6]); } | |
if( tcix[7] < (32*784) ) { out[ tpix[7] + tcix[7] ] = max(0.0f,out_tile[63] + filts_strip[7]); } | |
// end t_tile_stores; | |
#endif | |
} | |
// -- template substitution table used: -- | |
/* num_imgs = 20 */ | |
/* in_pad = 0 */ | |
/* in_dim_0 = 28 */ | |
/* in_dim_1 = 28 */ | |
/* conv_has_relu = 1 */ | |
/* kern_sz = 1 */ | |
/* stride = 1 */ | |
/* out_chans = 32 */ | |
/* in_chans = 256 */ | |
/* rtc_func_name = conv__num_imgs_20__in_pad_0__in_dim_0_28__in_dim_1_28__conv_has_relu_1__kern_sz_1__stride_1__out_chans_32__in_chans_256 */ | |
/* t_tile_sz = 8 */ | |
/* out_ix_x_dim = 28 */ | |
/* out_ix_x_sz = 1 */ | |
/* out_ix_x_nomod = out_ix */ | |
/* out_ix_x = (out_ix%%28) */ | |
/* out_ix_y_dim = 28 */ | |
/* out_ix_y_sz = 28 */ | |
/* out_ix_y_nomod = (out_ix/28) */ | |
/* out_ix_y = ((out_ix/28)%%28) */ | |
/* out_ix_chan_dim = 32 */ | |
/* out_ix_chan_sz = 784 */ | |
/* out_ix_chan_nomod = (out_ix/784) */ | |
/* out_ix_chan = ((out_ix/784)%%32) */ | |
/* out_ix_img_dim = 20 */ | |
/* out_ix_img_sz = 25088 */ | |
/* out_ix_img_nomod = (out_ix/25088) */ | |
/* out_ix_img = (out_ix/25088) */ | |
/* out_ix_sz = 501760 */ | |
/* in_ix_x_dim = 28 */ | |
/* in_ix_x_sz = 1 */ | |
/* in_ix_x_nomod = in_ix */ | |
/* in_ix_x = (in_ix%%28) */ | |
/* in_ix_y_dim = 28 */ | |
/* in_ix_y_sz = 28 */ | |
/* in_ix_y_nomod = (in_ix/28) */ | |
/* in_ix_y = ((in_ix/28)%%28) */ | |
/* in_ix_chan_dim = 256 */ | |
/* in_ix_chan_sz = 784 */ | |
/* in_ix_chan_nomod = (in_ix/784) */ | |
/* in_ix_chan = ((in_ix/784)%%256) */ | |
/* in_ix_img_dim = 20 */ | |
/* in_ix_img_sz = 200704 */ | |
/* in_ix_img_nomod = (in_ix/200704) */ | |
/* in_ix_img = (in_ix/200704) */ | |
/* in_ix_sz = 4014080 */ | |
/* t_smem_patch_ix_x_dim = 28 */ | |
/* t_smem_patch_ix_x_sz = 1 */ | |
/* t_smem_patch_ix_x_nomod = t_smem_patch_ix */ | |
/* t_smem_patch_ix_x = (t_smem_patch_ix%%28) */ | |
/* t_smem_patch_ix_y_dim = 28 */ | |
/* t_smem_patch_ix_y_sz = 28 */ | |
/* t_smem_patch_ix_y_nomod = (t_smem_patch_ix/28) */ | |
/* t_smem_patch_ix_y = ((t_smem_patch_ix/28)%%28) */ | |
/* t_smem_patch_ix_img_dim = 20 */ | |
/* t_smem_patch_ix_img_sz = 784 */ | |
/* t_smem_patch_ix_img_nomod = (t_smem_patch_ix/784) */ | |
/* t_smem_patch_ix_img = (t_smem_patch_ix/784) */ | |
/* t_smem_patch_ix_sz = 15680 */ | |
/* filts_ix_out_chan_elem_x_dim = 1 */ | |
/* filts_ix_out_chan_elem_x_sz = 1 */ | |
/* filts_ix_out_chan_elem_x_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_x = (filts_ix_out_chan_elem%%1) */ | |
/* filts_ix_out_chan_elem_y_dim = 1 */ | |
/* filts_ix_out_chan_elem_y_sz = 1 */ | |
/* filts_ix_out_chan_elem_y_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_y = (filts_ix_out_chan_elem%%1) */ | |
/* filts_ix_out_chan_elem_in_chan_dim = 256 */ | |
/* filts_ix_out_chan_elem_in_chan_sz = 1 */ | |
/* filts_ix_out_chan_elem_in_chan_nomod = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_in_chan = filts_ix_out_chan_elem */ | |
/* filts_ix_out_chan_elem_sz = 256 */ | |
/* LOC_ID_1D_out_chan_tile_dim = 4 */ | |
/* LOC_ID_1D_out_chan_tile_sz = 1 */ | |
/* LOC_ID_1D_out_chan_tile_nomod = LOC_ID_1D */ | |
/* LOC_ID_1D_out_chan_tile = (LOC_ID_1D%%4) */ | |
/* LOC_ID_1D_patch_tile_dim = 32 */ | |
/* LOC_ID_1D_patch_tile_sz = 4 */ | |
/* LOC_ID_1D_patch_tile_nomod = (LOC_ID_1D/4) */ | |
/* LOC_ID_1D_patch_tile = (LOC_ID_1D/4) */ | |
/* LOC_ID_1D_sz = 128 */ | |
/* filts_xp_ix_out_chan_tile_dim = 4 */ | |
/* filts_xp_ix_out_chan_tile_sz = 1 */ | |
/* filts_xp_ix_out_chan_tile_nomod = filts_xp_ix */ | |
/* filts_xp_ix_out_chan_tile = (filts_xp_ix%%4) */ | |
/* filts_xp_ix_out_chan_reg_dim = 8 */ | |
/* filts_xp_ix_out_chan_reg_sz = 4 */ | |
/* filts_xp_ix_out_chan_reg_nomod = (filts_xp_ix/4) */ | |
/* filts_xp_ix_out_chan_reg = ((filts_xp_ix/4)%%8) */ | |
/* filts_xp_ix_x_dim = 1 */ | |
/* filts_xp_ix_x_sz = 32 */ | |
/* filts_xp_ix_x_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_x = ((filts_xp_ix/32)%%1) */ | |
/* filts_xp_ix_y_dim = 1 */ | |
/* filts_xp_ix_y_sz = 32 */ | |
/* filts_xp_ix_y_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_y = ((filts_xp_ix/32)%%1) */ | |
/* filts_xp_ix_in_chan_dim = 256 */ | |
/* filts_xp_ix_in_chan_sz = 32 */ | |
/* filts_xp_ix_in_chan_nomod = (filts_xp_ix/32) */ | |
/* filts_xp_ix_in_chan = ((filts_xp_ix/32)%%256) */ | |
/* filts_xp_ix_out_chan_blk_dim = 1 */ | |
/* filts_xp_ix_out_chan_blk_sz = 8192 */ | |
/* filts_xp_ix_out_chan_blk_nomod = (filts_xp_ix/8192) */ | |
/* filts_xp_ix_out_chan_blk = (filts_xp_ix/8192) */ | |
/* filts_xp_ix_sz = 8192 */ | |
/* patch_smem_load_iter = 2 */ | |
/* GRP_ID_1D_out_chan_blk_dim = 1 */ | |
/* GRP_ID_1D_out_chan_blk_sz = 1 */ | |
/* GRP_ID_1D_out_chan_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_out_chan_blk = (GRP_ID_1D%%1) */ | |
/* GRP_ID_1D_patch_blk_dim = 62 */ | |
/* GRP_ID_1D_patch_blk_sz = 1 */ | |
/* GRP_ID_1D_patch_blk_nomod = GRP_ID_1D */ | |
/* GRP_ID_1D_patch_blk = GRP_ID_1D */ | |
/* GRP_ID_1D_sz = 62 */ | |
/* out_chan_tile = (%(LOC_ID_1D_out_chan_tile)+%(GRP_ID_1D_out_chan_blk)*%(LOC_ID_1D_out_chan_tile_dim)) */ | |
/* patch_tile = (%(LOC_ID_1D_patch_tile)+%(GRP_ID_1D_patch_blk)*%(LOC_ID_1D_patch_tile_dim)) */ | |
/* out_chan_ix = (%(out_chan_tile)*%(t_tile_sz)) */ | |
/* patch_ix_0 = (%(patch_tile)*%(t_tile_sz)+0) */ | |
/* patch_ix_0_x_dim = 28 */ | |
/* patch_ix_0_x_sz = 1 */ | |
/* patch_ix_0_x_nomod = %(patch_ix_0) */ | |
/* patch_ix_0_x = (%(patch_ix_0)%%28) */ | |
/* patch_ix_0_y_dim = 28 */ | |
/* patch_ix_0_y_sz = 28 */ | |
/* patch_ix_0_y_nomod = (%(patch_ix_0)/28) */ | |
/* patch_ix_0_y = ((%(patch_ix_0)/28)%%28) */ | |
/* patch_ix_0_img_dim = 20 */ | |
/* patch_ix_0_img_sz = 784 */ | |
/* patch_ix_0_img_nomod = (%(patch_ix_0)/784) */ | |
/* patch_ix_0_img = (%(patch_ix_0)/784) */ | |
/* patch_ix_0_sz = 15680 */ | |
/* patch_ix_1 = (%(patch_tile)*%(t_tile_sz)+1) */ | |
/* patch_ix_1_x_dim = 28 */ | |
/* patch_ix_1_x_sz = 1 */ | |
/* patch_ix_1_x_nomod = %(patch_ix_1) */ | |
/* patch_ix_1_x = (%(patch_ix_1)%%28) */ | |
/* patch_ix_1_y_dim = 28 */ | |
/* patch_ix_1_y_sz = 28 */ | |
/* patch_ix_1_y_nomod = (%(patch_ix_1)/28) */ | |
/* patch_ix_1_y = ((%(patch_ix_1)/28)%%28) */ | |
/* patch_ix_1_img_dim = 20 */ | |
/* patch_ix_1_img_sz = 784 */ | |
/* patch_ix_1_img_nomod = (%(patch_ix_1)/784) */ | |
/* patch_ix_1_img = (%(patch_ix_1)/784) */ | |
/* patch_ix_1_sz = 15680 */ | |
/* patch_ix_2 = (%(patch_tile)*%(t_tile_sz)+2) */ | |
/* patch_ix_2_x_dim = 28 */ | |
/* patch_ix_2_x_sz = 1 */ | |
/* patch_ix_2_x_nomod = %(patch_ix_2) */ | |
/* patch_ix_2_x = (%(patch_ix_2)%%28) */ | |
/* patch_ix_2_y_dim = 28 */ | |
/* patch_ix_2_y_sz = 28 */ | |
/* patch_ix_2_y_nomod = (%(patch_ix_2)/28) */ | |
/* patch_ix_2_y = ((%(patch_ix_2)/28)%%28) */ | |
/* patch_ix_2_img_dim = 20 */ | |
/* patch_ix_2_img_sz = 784 */ | |
/* patch_ix_2_img_nomod = (%(patch_ix_2)/784) */ | |
/* patch_ix_2_img = (%(patch_ix_2)/784) */ | |
/* patch_ix_2_sz = 15680 */ | |
/* patch_ix_3 = (%(patch_tile)*%(t_tile_sz)+3) */ | |
/* patch_ix_3_x_dim = 28 */ | |
/* patch_ix_3_x_sz = 1 */ | |
/* patch_ix_3_x_nomod = %(patch_ix_3) */ | |
/* patch_ix_3_x = (%(patch_ix_3)%%28) */ | |
/* patch_ix_3_y_dim = 28 */ | |
/* patch_ix_3_y_sz = 28 */ | |
/* patch_ix_3_y_nomod = (%(patch_ix_3)/28) */ | |
/* patch_ix_3_y = ((%(patch_ix_3)/28)%%28) */ | |
/* patch_ix_3_img_dim = 20 */ | |
/* patch_ix_3_img_sz = 784 */ | |
/* patch_ix_3_img_nomod = (%(patch_ix_3)/784) */ | |
/* patch_ix_3_img = (%(patch_ix_3)/784) */ | |
/* patch_ix_3_sz = 15680 */ | |
/* patch_ix_4 = (%(patch_tile)*%(t_tile_sz)+4) */ | |
/* patch_ix_4_x_dim = 28 */ | |
/* patch_ix_4_x_sz = 1 */ | |
/* patch_ix_4_x_nomod = %(patch_ix_4) */ | |
/* patch_ix_4_x = (%(patch_ix_4)%%28) */ | |
/* patch_ix_4_y_dim = 28 */ | |
/* patch_ix_4_y_sz = 28 */ | |
/* patch_ix_4_y_nomod = (%(patch_ix_4)/28) */ | |
/* patch_ix_4_y = ((%(patch_ix_4)/28)%%28) */ | |
/* patch_ix_4_img_dim = 20 */ | |
/* patch_ix_4_img_sz = 784 */ | |
/* patch_ix_4_img_nomod = (%(patch_ix_4)/784) */ | |
/* patch_ix_4_img = (%(patch_ix_4)/784) */ | |
/* patch_ix_4_sz = 15680 */ | |
/* patch_ix_5 = (%(patch_tile)*%(t_tile_sz)+5) */ | |
/* patch_ix_5_x_dim = 28 */ | |
/* patch_ix_5_x_sz = 1 */ | |
/* patch_ix_5_x_nomod = %(patch_ix_5) */ | |
/* patch_ix_5_x = (%(patch_ix_5)%%28) */ | |
/* patch_ix_5_y_dim = 28 */ | |
/* patch_ix_5_y_sz = 28 */ | |
/* patch_ix_5_y_nomod = (%(patch_ix_5)/28) */ | |
/* patch_ix_5_y = ((%(patch_ix_5)/28)%%28) */ | |
/* patch_ix_5_img_dim = 20 */ | |
/* patch_ix_5_img_sz = 784 */ | |
/* patch_ix_5_img_nomod = (%(patch_ix_5)/784) */ | |
/* patch_ix_5_img = (%(patch_ix_5)/784) */ | |
/* patch_ix_5_sz = 15680 */ | |
/* patch_ix_6 = (%(patch_tile)*%(t_tile_sz)+6) */ | |
/* patch_ix_6_x_dim = 28 */ | |
/* patch_ix_6_x_sz = 1 */ | |
/* patch_ix_6_x_nomod = %(patch_ix_6) */ | |
/* patch_ix_6_x = (%(patch_ix_6)%%28) */ | |
/* patch_ix_6_y_dim = 28 */ | |
/* patch_ix_6_y_sz = 28 */ | |
/* patch_ix_6_y_nomod = (%(patch_ix_6)/28) */ | |
/* patch_ix_6_y = ((%(patch_ix_6)/28)%%28) */ | |
/* patch_ix_6_img_dim = 20 */ | |
/* patch_ix_6_img_sz = 784 */ | |
/* patch_ix_6_img_nomod = (%(patch_ix_6)/784) */ | |
/* patch_ix_6_img = (%(patch_ix_6)/784) */ | |
/* patch_ix_6_sz = 15680 */ | |
/* patch_ix_7 = (%(patch_tile)*%(t_tile_sz)+7) */ | |
/* patch_ix_7_x_dim = 28 */ | |
/* patch_ix_7_x_sz = 1 */ | |
/* patch_ix_7_x_nomod = %(patch_ix_7) */ | |
/* patch_ix_7_x = (%(patch_ix_7)%%28) */ | |
/* patch_ix_7_y_dim = 28 */ | |
/* patch_ix_7_y_sz = 28 */ | |
/* patch_ix_7_y_nomod = (%(patch_ix_7)/28) */ | |
/* patch_ix_7_y = ((%(patch_ix_7)/28)%%28) */ | |
/* patch_ix_7_img_dim = 20 */ | |
/* patch_ix_7_img_sz = 784 */ | |
/* patch_ix_7_img_nomod = (%(patch_ix_7)/784) */ | |
/* patch_ix_7_img = (%(patch_ix_7)/784) */ | |
/* patch_ix_7_sz = 15680 */ | |
/* get_in = float v = 0; | |
int const smem_in_ix_y = %(t_smem_patch_ix_y)*%(stride)+%(filts_ix_out_chan_elem_y) - %(in_pad); | |
int const smem_in_ix_x = %(t_smem_patch_ix_x)*%(stride)+%(filts_ix_out_chan_elem_x) - %(in_pad); | |
if(smem_in_ix_y >= 0 && smem_in_ix_x >= 0 && | |
%(t_smem_patch_ix_img) < %(in_ix_img_dim) && | |
smem_in_ix_x < %(in_ix_x_dim) && smem_in_ix_y < %(in_ix_y_dim) ) { | |
v = in[%(t_smem_patch_ix_img)*%(in_ix_img_sz) + | |
%(filts_ix_out_chan_elem_in_chan)*%(in_ix_chan_sz) + | |
smem_in_ix_y*%(in_ix_y_sz) + | |
smem_in_ix_x*%(in_ix_x_sz)]; | |
} */ | |
/* t_tile_fmas = // begin t_tile_fmas | |
out_tile[0] += filts_strip[0]*in_strip[0]; | |
out_tile[1] += filts_strip[1]*in_strip[0]; | |
out_tile[2] += filts_strip[2]*in_strip[0]; | |
out_tile[3] += filts_strip[3]*in_strip[0]; | |
out_tile[4] += filts_strip[4]*in_strip[0]; | |
out_tile[5] += filts_strip[5]*in_strip[0]; | |
out_tile[6] += filts_strip[6]*in_strip[0]; | |
out_tile[7] += filts_strip[7]*in_strip[0]; | |
out_tile[8] += filts_strip[0]*in_strip[1]; | |
out_tile[9] += filts_strip[1]*in_strip[1]; | |
out_tile[10] += filts_strip[2]*in_strip[1]; | |
out_tile[11] += filts_strip[3]*in_strip[1]; | |
out_tile[12] += filts_strip[4]*in_strip[1]; | |
out_tile[13] += filts_strip[5]*in_strip[1]; | |
out_tile[14] += filts_strip[6]*in_strip[1]; | |
out_tile[15] += filts_strip[7]*in_strip[1]; | |
out_tile[16] += filts_strip[0]*in_strip[2]; | |
out_tile[17] += filts_strip[1]*in_strip[2]; | |
out_tile[18] += filts_strip[2]*in_strip[2]; | |
out_tile[19] += filts_strip[3]*in_strip[2]; | |
out_tile[20] += filts_strip[4]*in_strip[2]; | |
out_tile[21] += filts_strip[5]*in_strip[2]; | |
out_tile[22] += filts_strip[6]*in_strip[2]; | |
out_tile[23] += filts_strip[7]*in_strip[2]; | |
out_tile[24] += filts_strip[0]*in_strip[3]; | |
out_tile[25] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment