marty1885/error

## error
xor
Using Intel , OpenCL platform: Intel Gen OCL Driver
Using OpenCL device: Intel(R) HD Graphics Skylake Desktop GT2
layer 0:InputLayer{ outputPlanes=2 outputSize=1 }
layer 1:ConvolutionalLayer{ LayerDimensions{ inputPlanes=2 inputSize=1 numFilters=2 filterSize=1 outputSize=1 padZeros=1 biased=1 skip=0} }
layer 2:ActivationLayer{ SIGMOID }
layer 3:ConvolutionalLayer{ LayerDimensions{ inputPlanes=2 inputSize=1 numFilters=2 filterSize=1 outputSize=1 padZeros=1 biased=1 skip=0} }
layer 4:ActivationLayer{ SIGMOID }
layer 5:SoftMaxLayer{ perPlane=0 numPlanes=2 imageSize=1 }

statefultimer v0.7
forward try kernel 0
  ... not plausibly optimal, skipping
forward try kernel 1
   ... seems valid
ForwardAuto: kernel 1 0ms
forward try kernel 0
  ... not plausibly optimal, skipping
forward try kernel 1
   ... seems valid
ForwardAuto: kernel 1 0ms
backward try kernel 0
  ... not plausibly optimal, skipping
backward try kernel 1
   ... seems valid
BackwardAuto: kernel 1 0ms
calcGradWeights try kernel 0
  ... not plausibly optimal, skipping
calcGradWeights try kernel 1
   ... seems valid
BackpropWeightsAuto: kernel 1 0ms
calcGradWeights try kernel 0
  ... not plausibly optimal, skipping
calcGradWeights try kernel 1
   ... seems valid
BackpropWeightsAuto: kernel 1 0ms

after epoch 1 289 ms
 training loss: 2.77967
 train accuracy: 2/4 50%
forward try kernel 2
   ... seems valid
ForwardAuto: kernel 2 6554ms
forward try kernel 2
   ... seems valid
drm_intel_gem_bo_context_exec() failed: Input/output error
ForwardAuto: kernel 2 this instance cant be used:
kernel source:
1: // Copyright Hugh Perkins 2015 hughperkins at gmail
2: //
3: // This Source Code Form is subject to the terms of the Mozilla Public License,
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can
5: // obtain one at http://mozilla.org/MPL/2.0/.
6:
7: kernel void per_element_add(const int N, global float *target, global const float *source) {
8:     const int globalId = get_global_id(0);
9:     if (globalId >= N) {
10:         return;
11:     }
12:     target[globalId] += source[globalId];
13: }
14:
15: // adds source to target
16: // tiles source as necessary, according to tilingSize
17: kernel void per_element_tiled_add(const int N, const int tilingSize, global float *target, global const float *source) {
18:     const int globalId = get_global_id(0);
19:     if (globalId >= N) {
20:         return;
21:     }
22:     target[globalId] += source[globalId % tilingSize];
23: }
24:
25: kernel void repeated_add(const int N, const int sourceSize, const int repeatSize, global float *target, global const float *source) {
26:     const int globalId = get_global_id(0);
27:     if (globalId >= N) {
28:         return;
29:     }
30:     target[globalId] += source[ (globalId / repeatSize) % sourceSize ];
31: }
32:
33:

Out of resources, code -5
forward try kernel 3
   ... seems valid
drm_intel_gem_bo_context_exec() failed: Input/output error
ForwardAuto: kernel 3 this instance cant be used:
kernel source:
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail
2: //
3: // This Source Code Form is subject to the terms of the Mozilla Public License,
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can
5: // obtain one at http://mozilla.org/MPL/2.0/.
6:
7: // concept: each workgroup handles convolving one input example with one filtercube
8: // and writing out one single output plane
9: //
10: // workgroup id organized like: [imageid][outplane]
11: // local id organized like: [outrow][outcol]
12: // each thread iterates over: [upstreamplane][filterrow][filtercol]
13: // number workgroups = 32
14: // one filter plane takes up 5 * 5 * 4 = 100 bytes
15: // one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok)
16: // all filter cubes = 3.2KB * 32 = 102KB (too big)
17: // output are organized like [imageid][filterid][row][col]
18: void kernel forward_3_by_n_outplane(const int batchSize,
19:       global const float *images, global const float *filters,
20:     global float *output,
21:     local float *_upstreamImage, local float *_filterCube) {
22:     const int globalId = get_global_id(0);
23:
24:     const int workgroupId = get_group_id(0);
25:     const int workgroupSize = get_local_size(0);
26:     const int n = workgroupId / gNumFilters;
27:     const int outPlane = workgroupId % gNumFilters;
28:
29:     const int localId = get_local_id(0);
30:     const int outputRow = localId / gOutputSize;
31:     const int outputCol = localId % gOutputSize;
32:
33:     const int minu = gPadZeros ? max(-gHalfFilterSize, -outputRow) : -gHalfFilterSize;
34:     const int maxu = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputRow  - gEven) : gHalfFilterSize - gEven;
35:     const int minv = gPadZeros ? max(-gHalfFilterSize, -outputCol) : - gHalfFilterSize;
36:     const int maxv = gPadZeros ? min(gHalfFilterSize - gEven, gOutputSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;
37:
38:     const int numUpstreamsPerThread = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;
39:
40:     const int filterCubeLength = gInputPlanes * gFilterSizeSquared;
41:     const int filterCubeGlobalOffset = outPlane * filterCubeLength;
42:     const int numPixelsPerThread = (filterCubeLength + workgroupSize - 1) / workgroupSize;
43:     for (int i = 0; i < numPixelsPerThread; i++) {
44:         int thisOffset = localId + i * workgroupSize;
45:         if (thisOffset < filterCubeLength) {
46:             _filterCube[thisOffset] = filters[filterCubeGlobalOffset + thisOffset];
47:         }
48:     }
49:     // dont need a barrier, since we'll just run behind the barrier from the upstream image download
50:
51:     float sum = 0;
52:     for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {
53:         int thisUpstreamImageOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;
54:         barrier(CLK_LOCAL_MEM_FENCE);
55:         for (int i = 0; i < numUpstreamsPerThread; i++) {
56:             int thisOffset = workgroupSize * i + localId;
57:             if (thisOffset < gInputSizeSquared) {
58:                 _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];
59:             }
60:         }
61:         barrier(CLK_LOCAL_MEM_FENCE);
62:         int filterImageOffset = upstreamPlane * gFilterSizeSquared;
63:         for (int u = minu; u <= maxu; u++) {
64:             int inputRow = outputRow + u;
65:             #if gPadZeros == 0
66:                 inputRow += gHalfFilterSize;
67:             #endif
68:             int inputimagerowoffset = inputRow * gInputSize;
69:             int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
70:             for (int v = minv; v <= maxv; v++) {
71:                 int inputCol = outputCol + v;
72:                 #if gPadZeros == 0
73:                     inputCol += gHalfFilterSize;
74:                 #endif
75:                 if (localId < gOutputSizeSquared) {
76:                     sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];
77:                 }
78:             }
79:         }
80:     }
81:
82:     // output are organized like [imageid][filterid][row][col]
83:     int resultIndex = (n * gNumFilters + outPlane) * gOutputSizeSquared + localId;
84:     if (localId < gOutputSizeSquared) {
85:         output[resultIndex ] = sum;
86:     }
87: }
88:
89:

Out of resources, code -5
forward try kernel 4
   ... seems valid
drm_intel_gem_bo_context_exec() failed: Input/output error
ForwardAuto: kernel 4 this instance cant be used:
kernel source:
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail
2: //
3: // This Source Code Form is subject to the terms of the Mozilla Public License,
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can
5: // obtain one at http://mozilla.org/MPL/2.0/.
6:
7: void copyLocal(local float *target, global float const *source, int N) {
8:     int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);
9:     for (int loop = 0; loop < numLoops; loop++) {
10:         int offset = loop * get_local_size(0) + get_local_id(0);
11:         if (offset < N) {
12:             target[offset] = source[offset];
13:         }
14:     }
15: }
16:
17: #ifdef gOutputSize // for previous tests that dont define it
18: // workgroup id organized like: [n][filterid]
19: // local id organized like: [outrow][outcol]
20: // each thread iterates over: [upstreamplane][filterrow][filtercol]
21: // number workgroups = 32
22: // one filter plane takes up 5 * 5 * 4 = 100 bytes
23: // one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok)
24: // all filter cubes = 3.2KB * 32 = 102KB (too big)
25: // output are organized like [n][filterid][outrow][outcol]
26: // the pixels per thread thing... :
27: // - we have one thread (~= cuda core) per output value,
28: //   ie one thread for each combination of [outrow][outcol]
29: // - however, the number of threads is typically limited on a gpu,
30: //   eg to 512 (eg Intel HD), or 1024 (eg nVidia K520)
31: // - so what happens if the number of output points is larger than
32: //   the maximum workgroup size?
33: // - then we have several possibilities really:
34: //   - we can divide the image into blocks, and process each block
35: //     separately.  This is probably a good option, but fair amount of
36: //     work
37: //   - we can get each thread to handle more than one output
38: //     pixel, by looping
39: //   - we can consider the output image in 1d, by putting the rows
40: //     one after another, and assign each contiguous workgroup-size
41: //     block to one workgroup
42: //     => this is how this kernel works
43: //     basically, it's a hack, so larger images actually run, without
44: //     crashing, and we can probably improve it a lot :-)
45: //
46: // So, when outputSize * outputSize > workgroupSize, then
47: // multiple workgroups will be created for each output plane
48: // the number of such workgroups is given by: `gPixelsPerThread`
49: // the id of our workgroup within such a set of workgroups is calculated
50: // as `pixel`
51: // effectiveLocalId is our local id if we had one enormous workgroup
52: // containing the whole output image plane
53: void kernel forward_4_by_n_outplane_smallercache(const int batchSize,
54:       global const float *images, global const float *filters,
55:     global float *output,
56:     local float *_inputPlane, local float *_filterPlane) {
57:     #define globalId (get_global_id(0))
58:
59:     #define localId (get_local_id(0))
60:     #define workgroupId (get_group_id(0))
61: //    const int workgroupSize = get_local_size(0);
62:     const int effectiveWorkgroupId = workgroupId / gPixelsPerThread;
63:     const int pixel = workgroupId % gPixelsPerThread;
64:     const int effectiveLocalId = localId + pixel * gWorkgroupSize;
65:     const int n = effectiveWorkgroupId / gNumFilters;
66:     const int outPlane = effectiveWorkgroupId % gNumFilters;
67:
68:     const int outputRow = effectiveLocalId / gOutputSize;
69:     const int outputCol = effectiveLocalId % gOutputSize;
70:
71:     float sum = 0;
72:     for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++) {
73:         barrier(CLK_LOCAL_MEM_FENCE);
74:         copyLocal(_inputPlane, images + (n * gInputPlanes + upstreamPlane) * gInputSizeSquared, gInputSizeSquared);
75:         copyLocal(_filterPlane, filters + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);
76:         barrier(CLK_LOCAL_MEM_FENCE);
77:
78:         if (effectiveLocalId < gOutputSizeSquared) {
79:             for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) {
80:                 // trying to reduce register pressure...
81:                 #if gPadZeros == 1
82:                     #define inputRow (outputRow + u)
83:                 #else
84:                     #define inputRow (outputRow + u + gHalfFilterSize)
85:                 #endif
86:                 int inputimagerowoffset = inputRow * gInputSize;
87:                 int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
88:                 bool rowOk = inputRow >= 0 && inputRow < gInputSize;
89:                 for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) {
90:                     #if gPadZeros == 1
91:                         #define inputCol (outputCol + v)
92:                     #else
93:                         #define inputCol (outputCol + v + gHalfFilterSize)
94:                     #endif
95:                     bool process = rowOk && inputCol >= 0 && inputCol < gInputSize;
96:                     if (process) {
97:                             sum += _inputPlane[ inputimagerowoffset + inputCol] * _filterPlane[ filterrowoffset + v ];
98:                     }
99:                 }
100:             }
101:         }
102:     }
103:     // output are organized like [imageid][filterid][row][col]
104:     #define resultIndex (( n * gNumFilters + outPlane) * gOutputSizeSquared + effectiveLocalId)
105:     if (effectiveLocalId < gOutputSizeSquared) {
106:         output[resultIndex ] = sum;
107:     }
108: }
109: #endif
110:
111:

Out of resources, code -5
forward try kernel 5
ForwardAuto: kernel 5: this instance cant be used: For ForwardFc, padzeros must be disabled
   ... not valid
forward try kernel 6
   ... seems valid
drm_intel_gem_bo_context_exec() failed: Input/output error
ForwardAuto: kernel 6 this instance cant be used:
kernel source:
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail
2: //
3: // This Source Code Form is subject to the terms of the Mozilla Public License,
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can
5: // obtain one at http://mozilla.org/MPL/2.0/.
6:
7: // concept:
8: // - load same input plane from each image
9: // - hold filter plane for this input plane, for all filters
10: // - reduce afterwards
11: // local memory for one plane from each filter of 64c7 = 64 * 7 * 7 * 4 = 12.5KB
12: // local memory for one single input plane = 19 * 19 * 4 = 1.4KB
13: // => seems ok?
14: // workgroupid: [inputPlaneId]
15: // localid: [filterId][outRow] (if this is more than workgroupsize, we should reuse some threads...)
16: // iterate over: [n][outCol]
17: // output: [n][filterId][outRow][outCol][inputPlane]
18: // need to later reduce output over: [inputPlane]
19: void kernel forward_byinputplane(const int batchSize,
20:       global const float *images, global const float *filters,
21:     global float *output,
22:     local float *_inputPlane, local float *_filterPlanes) {
23: //    const int evenPadding = gFilterSize % 2 == 0 ? 1 : 0;
24:
25:     const int globalId = get_global_id(0);
26:     const int workgroupId = get_group_id(0);
27:     const int workgroupSize = get_local_size(0);
28:     const int localId = get_local_id(0);
29:
30:     const int inputPlaneId = workgroupId;
31:     const int numLoops = (gNumFilters * gOutputSize + workgroupSize - 1) / workgroupSize;
32:     const int numFilterCopyLoops = (gFilterSizeSquared + gOutputSize - 1) / gOutputSize;
33:     const int numImageCopyLoops = (gInputSizeSquared + workgroupSize - 1) / workgroupSize;
34:     for (int loop = 0; loop < numLoops; loop++) {
35:         const int loopLocalId = localId + loop * workgroupSize;
36:         const int filterId = loopLocalId / gOutputSize;
37:         const int outRow = loopLocalId % gOutputSize;
38:
39:         // copy down our filter, we have gOutputSize threads to do this
40:         global float const *globalFilterPlane = filters +
41:             (filterId * gNumInputPlanes + inputPlaneId) * gFilterSizeSquared;
42:         local float *_localFilterPlane = _filterPlanes + filterId * gFilterSizeSquared;
43:         barrier(CLK_LOCAL_MEM_FENCE);
44:         for (int i = 0; i < numFilterCopyLoops; i++) {
45:             const int offset = i * gOutputSize + outRow;
46:             bool process = filterId < gNumFilters && offset < gFilterSizeSquared;
47:             if (process) {
48:                 _localFilterPlane[ offset ] = globalFilterPlane[ offset ];
49:             }
50:         }
51:         // loop over n ...
52:         for (int n = 0; n < batchSize; n++) {
53:             // copy down our imageplane, we have workgroupSize threads to do this
54:             barrier(CLK_LOCAL_MEM_FENCE);
55:             global float const *globalImagePlane = images +
56:                 (n * gNumInputPlanes + inputPlaneId) * gInputSizeSquared;
57:             for (int i = 0; i< numImageCopyLoops; i++) {
58:                 const int offset = i * workgroupSize + localId;
59:                 if (offset < gInputSizeSquared) {
60:                     _inputPlane[ offset ] = globalImagePlane[ offset ];
61:                 }
62:             }
63:             barrier(CLK_LOCAL_MEM_FENCE);
64:             // calc output for each [outrow][outcol]
65:             bool filterPlaneOk = filterId < gNumFilters;
66:             for (int outCol = 0; outCol < gOutputSize; outCol++) {
67:                 float sum = 0;
68:                 for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {
69:                     int inRow = outRow + filterRow;
70:                     #if gPadZeros == 1
71:                         inRow -= gHalfFilterSize;
72:                     #endif
73:                     bool rowOk = filterPlaneOk && inRow >= 0 && inRow < gInputSize;
74:                     for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {
75:                         int inCol = outCol + filterCol;
76:                         #if gPadZeros == 1
77:                             inCol -= gHalfFilterSize;
78:                         #endif
79:                         bool process = rowOk && inCol >= 0 && inCol < gInputSize;
80:                         if (process) {
81:                             float imageValue = _inputPlane[ inRow * gInputSize + inCol ];
82:                             float filterValue = _localFilterPlane[ filterRow * gFilterSize + filterCol ];
83:                             sum += imageValue * filterValue;
84:                         }
85:                     }
86:                 }
87:                 if (filterId < gNumFilters) {
88:                     // [n][filterId][outRow][outCol][inputPlane]
89:                     int resultIndex = (( (n
90:                         * gNumFilters + filterId)
91:                         * gOutputSize + outRow)
92:                         * gOutputSize + outCol)
93:                         * gNumInputPlanes + inputPlaneId;
94:                     output[resultIndex] = sum;
95:                     //if (globalId == 2) output[0] = resultIndex;
96: //                    output[resultIndex] = outRow;
97:                 }
98: //                output[localId] = _localFilterPlane[localId];
99:             }
100:         }
101:     }
102: }
103:
104:

Out of resources, code -5
forward try kernel 7
   ... seems valid
drm_intel_gem_bo_context_exec() failed: Input/output error
ForwardAuto: kernel 7 this instance cant be used:
kernel source:
1: // from SpatialConvolutionMM.cu:
2:
3: // CL: grid stride looping
4: #define CL_KERNEL_LOOP(i, n)                        \
5:   for (int i = get_group_id(0) * get_local_size(0) + get_local_id(0); \
6:       i < (n);                                       \
7:       i += get_local_size(0) * get_num_groups(0))
8:
9: //#define gPadding 0
10: //#define gStride 1
11: //#define gColSize 1
12: //#define gFilterSize 1
13: //#define gSize 1
14:
15: // Kernel for fast unfold+copy
16: // (adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
17: kernel void im2col(
18:     const int n,
19:     global float const * im_data, int im_offset,
20:     global float* data_col) {
21:   global const float *data_im = im_data + im_offset;
22:
23:   CL_KERNEL_LOOP(index, n) {
24:     int w_out = index % 1;
25:     index /= 1;
26:     int h_out = index % 1;
27:     int channel_in = index / 1;
28:     int channel_out = channel_in * 1 * 1;
29:     int h_in = h_out * 1 - 0;
30:     int w_in = w_out * 1 - 0;
31:     data_col += (channel_out * 1 + h_out) * 1 + w_out;
32:     data_im += (channel_in * 1 + h_in) * 1 + w_in;
33:     for (int i = 0; i < 1; ++i) {
34:       for (int j = 0; j < 1; ++j) {
35:         int h = h_in + i;
36:         int w = w_in + j;
37:         *data_col = (h >= 0 && w >= 0 && h < 1 && w < 1) ?
38:           data_im[i * 1 + j] : 0;
39:         data_col += 1 * 1;
40:       }
41:     }
42:   }
43: }
44:
45: kernel void col2im(
46:     const int n,
47:     global float const *data_col,
48:     global float* im_data, int im_offset) {
49:   global float *data_im = im_data + im_offset;
50:
51:   for (int index = get_group_id(0) * get_local_size(0) + get_local_id(0); index < (n); index += get_local_size(0) * get_num_groups(0)) {
52:     float val = 0;
53:     int w = index % 1 + 0;
54:     int h = (index / 1) % 1 + 0;
55:     int c = index / (1 * 1);
56:     // compute the start and end of the output
57:     int w_col_start = (w < 1) ? 0 : (w - 1) / 1 + 1;
58:     int w_col_end = min(w / 1 + 1, 1);
59:     int h_col_start = (h < 1) ? 0 : (h - 1) / 1 + 1;
60:     int h_col_end = min(h / 1 + 1, 1);
61:
62:     int offset = (c * 1 * 1 + h * 1 + w) * 1 * 1;
63:     int coeff_h_col = (1 - 1 * 1 * 1) * 1;
64:     int coeff_w_col = (1 - 1 * 1 * 1);
65:     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
66:       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
67:         val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
68:       }
69:     }
70:     data_im[index] = val;
71:   }
72: }
73:
74:

Out of resources, code -5
   forward kernel 0: cannot be used
   forward kernel 1 time: 0ms
   forward kernel 2: cannot be used
   forward kernel 3: cannot be used
   forward kernel 4: cannot be used
   forward kernel 5: cannot be used
   forward kernel 6: cannot be used
   forward kernel 7: cannot be used
   forward layer selected kernel 1
drm_intel_gem_bo_context_exec() failed: Input/output error
terminate called after throwing an instance of 'std::runtime_error'
  what():
kernel source:
1: // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail
2: //
3: // This Source Code Form is subject to the terms of the Mozilla Public License,
4: // v. 2.0. If a copy of the MPL was not distributed with this file, You can
5: // obtain one at http://mozilla.org/MPL/2.0/.
6:
7: // notes on non-odd filtersizes:
8: // for odd, imagesize and filtersize 3, padZeros = 0:
9: // output is a single square
10: // m and n should vary between -1,0,1
11: // for even, imagesize and filtersize 2, padzeros = 0
12: // output is a single square, which we can position at topleft or bottomrigth
13: // lets position it in bottomright
14: // then m and n should vary as -1,0
15: //
16: // for even, imagesize and filtersize 2, padzeros = 1
17: // output is 2 by 2
18: // well... if it is even:
19: // - if we are not padding zeros, then we simply move our filter around the image somehow
20: // - if we are padding zeros, then we conceptually pad the bottom and right edge of the image with zeros by 1
21: // filtersize remains the same
22: //      m will vary as -1,0,1
23: //       outputrow is fixed by globalid
24: //       inputrow should be unchanged...
25: // padzeros = 0:
26: //  x x .  . . .
27: //  x x .  . x x
28: //  . . .  . x x
29: // when filtersize even:
30: //    new imagesize = oldimagesize - filtersize + 1
31: // when filtersize odd:
32: //    x x x .
33: //    x x x .
34: //    x x x .
35: //    . . . .
36: //    new imagesize = oldimagesize - filtersize + 1
37: // padzeros = 1:
38: // x x
39: // x x . .   x x .    . . .     . . .
40: //   . . .   x x .    . x x     . . .
41: //   . . .   . . .    . x x     . . x x
42: // outrow=0 outrow=1  outrow=2      x x
43: // outcol=0 outcol=1  outcol=2    outrow=3
44: //                                outcol=3
45: // when filtersize is even, and padzeros, imagesize grows by 1 each time...
46: //    imagesize = oldimagesize + 1
47: // when filtersize is odd
48: //  x x x
49: //  x x x .   x x x    . . .
50: //  x x x .   x x x    . x x x
51: //    . . .   x x x    . x x x
52: //                       x x x
53:
54: // images are organized like [imageId][plane][row][col]
55: // filters are organized like [filterid][inplane][filterrow][filtercol]
56: // output are organized like [imageid][filterid][row][col]
57: // global id is organized like output, ie: [imageid][outplane][outrow][outcol]
58: // - no local memory used currently
59: // - each thread:
60: //     - loads a whole upstream cube
61: //     - loads a whole filter cube
62: //     - writes one output...
63: void kernel convolve_imagecubes_float2(
64:     const int numExamples,
65:       global const float *inputs, global const float *filters,
66:     global float *output) {
67:     int globalId = get_global_id(0);
68:
69:     int outputImage2Id = globalId / gOutputSizeSquared;
70:     int exampleId = outputImage2Id / gNumFilters;
71:     int filterId = outputImage2Id % gNumFilters;
72:
73:     // intraimage coords
74:     int localid = globalId % gOutputSizeSquared;
75:     int outputRow = localid / gOutputSize;
76:     int outputCol = localid % gOutputSize;
77:
78:     global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputSizeSquared;
79:     global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared;
80:
81:     float sum = 0;
82:     if (exampleId < numExamples) {
83:         for (int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++) {
84:             global float const*inputPlane = inputCube + inputPlaneIdx * gInputSizeSquared;
85:             global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared;
86:             for (int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++) {
87:                 // trying to reduce register pressure...
88:                 #if gPadZeros == 1
89:                     #define inputRowIdx (outputRow + u)
90:                 #else
91:                     #define inputRowIdx (outputRow + u + gHalfFilterSize)
92:                 #endif
93:                 global float const *inputRow = inputPlane + inputRowIdx * gInputSize;
94:                 global float const *filterRow = filterPlane + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;
95:                 bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputSize;
96:                 #pragma unroll
97:                 for (int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++) {
98:                     #if gPadZeros == 1
99:                         #define inputColIdx (outputCol + v)
100:                     #else
101:                         #define inputColIdx (outputCol + v + gHalfFilterSize)
102:                     #endif
103:                     bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputSize;
104:                     if (process) {
105:                             sum += inputRow[inputColIdx] * filterRow[v];
106:                     }
107:                 }
108:             }
109:         }
110:     }
111:
112:     if (exampleId < numExamples) {
113:         output[globalId] = sum;
114:     }
115: }
116:
117:

Out of resources, code -5