@dniku
Created February 12, 2015 16:13
Using gpu device 0: GRID K520
error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.
(train.py:2434): Gdk-CRITICAL **: gdk_cursor_new_for_display: assertion 'GDK_IS_DISPLAY (display)' failed
/home/ubuntu/pylearn2/pylearn2/utils/image.py:16: UserWarning: Unable to import matplotlib. Some features unavailable. Original exception: constructor returned NULL
"Original exception: " + str(matplotlib_exception))
(train.py:2434): Gdk-CRITICAL **: gdk_cursor_new_for_display: assertion 'GDK_IS_DISPLAY (display)' failed
/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/__init__.py:66: UserWarning: You are using probably a too old Theano version. That will cause compilation crash. If so, update Theano.
"You are using probably a too old Theano version. That"
Input shape: (28, 28)
Detector space: (21, 21)
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <cuda_runtime.h>
#include <nvmatrix_kernels.cuh>

__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    // const unsigned int numEls = tgtWidth * tgtHeight;
    for (uint i = idx; i < tgtWidth * tgtHeight; i += numThreads) {
        const uint y = i / tgtWidth;
        const uint x = i % tgtWidth;
        const uint srcY = y % srcHeight;
        const uint srcX = x % srcWidth;
        tgt[i] = src[srcY * srcWidth + srcX];
    }
}
43
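/*
 * Host-side usage sketch (illustration only, not part of the original file):
 * tiles a 2x3 source into a 4x6 target by launching kTile directly.  The
 * launch configuration here is arbitrary; NVMatrix::tile() below uses
 * NUM_TILE_BLOCKS / NUM_TILE_THREADS_PER_BLOCK instead.
 */
#if 0
void tileExample() {
    const uint srcW = 3, srcH = 2, tgtW = 6, tgtH = 4;
    const float hostSrc[6] = {1, 2, 3, 4, 5, 6};
    float *devSrc, *devTgt;
    cudaMalloc((void**) &devSrc, sizeof(hostSrc));
    cudaMalloc((void**) &devTgt, tgtW * tgtH * sizeof(float));
    cudaMemcpy(devSrc, hostSrc, sizeof(hostSrc), cudaMemcpyHostToDevice);
    // One block of 128 threads covers all 24 outputs; the grid-stride
    // loop in kTile makes surplus threads exit immediately.
    kTile<<<1, 128>>>(devSrc, devTgt, srcW, srcH, tgtW, tgtH);
    cudaDeviceSynchronize();
}
#endif
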
__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numCols, const uint numElements) {
    __shared__ float shmem[DP_BLOCKSIZE];

    uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x;
    shmem[threadIdx.x] = 0;
    if (eidx < numCols) {
        for (; eidx < numElements; eidx += numCols) {
            shmem[threadIdx.x] += a[eidx] * b[eidx];
        }
    }
    __syncthreads();
    // Standard shared-memory tree reduction; the halving sequence starting
    // at 256 implies DP_BLOCKSIZE == 512.
    if (threadIdx.x < 256) {
        shmem[threadIdx.x] += shmem[threadIdx.x + 256];
    }
    __syncthreads();
    if (threadIdx.x < 128) {
        shmem[threadIdx.x] += shmem[threadIdx.x + 128];
    }
    __syncthreads();
    if (threadIdx.x < 64) {
        shmem[threadIdx.x] += shmem[threadIdx.x + 64];
    }
    __syncthreads();
    if (threadIdx.x < 32) {
        // Within a single warp no __syncthreads() is needed, but the pointer
        // must be volatile so the compiler does not cache shared-memory reads.
        volatile float* mysh = &shmem[threadIdx.x];
        *mysh += mysh[32];
        *mysh += mysh[16];
        *mysh += mysh[8];
        *mysh += mysh[4];
        *mysh += mysh[2];
        *mysh += mysh[1];
        if (threadIdx.x == 0) {
            target[blockIdx.x] = *mysh;
        }
    }
}

__global__ void kSetupCurand(curandState *state, unsigned long long seed) {
    const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
    /* Each thread gets same seed, a different sequence number,
       no offset */
    curand_init(seed, tidx, 0, &state[tidx]);
}


/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _NVMATRIX_EXPORT
#define _NVMATRIX_EXPORT
#endif

#include <set>
#include <vector>
#include <assert.h>
#include <cublas_v2.h>
#include <cutil_inline.h>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <typeinfo>
#include <nvmatrix.cuh>
#include <nvmatrix_operators.cuh>
#include <map>

using namespace std;

/*
 * Device random number generator pointers.
 */
//map<int,curandGenerator_t> NVMatrix::rndGen;
map<int,curandState*> NVMatrix::rndDevStates;
pthread_mutex_t* NVMatrix::_rndMutex = makeMutex();

pthread_mutex_t* NVMatrix::makeMutex() {
    pthread_mutex_t* m = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t));
    pthread_mutex_init(m, NULL);
    return m;
}

NVMatrix::NVMatrix(const CudaNdarray * view,
        int numRows, int numCols, const char * msg)
{
    if (!CudaNdarray_is_c_contiguous(view))
    {
        printf("Non contiguous input: %s\n", msg);
        printf("Strides: "); // label fixed: the values printed below are strides
        for (int i = 0; i < view->nd; i++)
            printf("%d ", CudaNdarray_HOST_STRIDES(view)[i]);
        printf("\n");
        assert(false);
    }

    // Check that view actually contains numRows * numCols elements
    const int * dims = CudaNdarray_HOST_DIMS(view);
    int total = 1;
    for (int i = 0; i < view->nd; i++)
    {
        total *= dims[i];
    }
    if (total != numRows * numCols)
    {
        fprintf(stderr, "NVMatrix asked to make a view of a CudaNdarray with %d elements", total);
        fprintf(stderr, " but told to arrange these in a %d x %d rectangle (of total size %d).\n",
                numRows, numCols, numRows * numCols);
        fprintf(stderr, "CudaNdarray dims: ");
        for (int i = 0; i < view->nd; i++)
            fprintf(stderr, "%d ", dims[i]);
        fprintf(stderr, "\n");
        assert(false);
    }

    // Make the view
    _numRows = numRows;
    _numCols = numCols;
    _numElements = numRows * numCols;
    _ownsData = false;
    _isTrans = false;
    _devData = view->devdata;
    _stride = getLeadingDim();
}

void NVMatrix::_init(int numRows, int numCols, int stride, bool isTrans) {
    _numRows = numRows;
    _numCols = numCols;
    _numElements = numRows * numCols;
    _ownsData = true;

    _isTrans = isTrans;
    _devData = NULL;
    if (_numElements > 0) {
        cudaError_t err = cudaMalloc((void**) &_devData,
                                     _numElements * sizeof(float));
        if (cudaSuccess != err) {
            fprintf(stderr, "!!!! device memory allocation error\n");
            exit(EXIT_FAILURE);
        }
    }
    _stride = stride < 0 ? getLeadingDim() : stride;
}

NVMatrix::NVMatrix() {
    _init(0, 0, -1, false);
}

NVMatrix::NVMatrix(bool isTrans) {
    _init(0, 0, -1, isTrans);
}

NVMatrix::NVMatrix(int numRows, int numCols, bool isTrans) {
    _init(numRows, numCols, -1, isTrans);
}

/*
NVMatrix::NVMatrix(const Matrix& like, bool copy) {
    _init(like.getNumRows(), like.getNumCols(), -1, like.isTrans());
    if (copy) {
        copyFromHost(like);
    }
}
*/

NVMatrix::NVMatrix(const NVMatrix& like, bool copy) {
    _init(like.getNumRows(), like.getNumCols(), -1, like.isTrans());
    if (copy) {
        like.copy(*this);
    }
}

/*
 * Initializes NVMatrix with same dimensions as given matrix but
 * does not copy any data.
 */
NVMatrix::NVMatrix(const NVMatrix& like) {
    _init(like.getNumRows(), like.getNumCols(), -1, like.isTrans());
}

/*
 * Initializes NVMatrix with same dimensions as given matrix but
 * does not copy any data.
NVMatrix::NVMatrix(const Matrix& like) {
    _init(like.getNumRows(), like.getNumCols(), -1, false);
}
*/

NVMatrix::NVMatrix(float* devData, int numRows, int numCols, int stride, bool isTrans) :
    _numRows(numRows),
    _numCols(numCols),
    _numElements(numRows*numCols),
    _ownsData(false),
    _devData(devData),
    _isTrans(isTrans) {
    _stride = stride < 0 ? getLeadingDim() : stride;
}

NVMatrix::~NVMatrix() {
    if (_ownsData && _numElements > 0) {
        // This line was modified by Ian Goodfellow to use device_free
        // so that theano may keep track of device memory usage
        int status = device_free(_devData);
        if (status != 0) {
            fprintf(stderr, "!!!! memory free error\n");
            exit(EXIT_FAILURE);
        }
    }
}

/*
void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeDeviceMatrix) {
    if (resizeDeviceMatrix) {
        resize(hostMatrix);
    }
    copyFromHost(hostMatrix);
}

void NVMatrix::copyFromHost(const Matrix& hostMatrix) {
//    assert(getStride() == getLeadingDim());
    assert(isSameDims(hostMatrix));
    setTrans(hostMatrix.isTrans());

    if (getNumElements() > 0) {
        cublasStatus status = cublasSetMatrix(hostMatrix.getLeadingDim(), hostMatrix.getFollowingDim(), sizeof(float),
                                              hostMatrix.getData(), hostMatrix.getLeadingDim(), _devData, _stride);
        if (status != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "!!!! device access error (write)\n");
            exit(EXIT_FAILURE);
        }
    }
}

void NVMatrix::copyToHost(Matrix& hostMatrix) const {
//    assert(getStride() == getLeadingDim());
    assert(isSameDims(hostMatrix));
    hostMatrix.setTrans(_isTrans);
    if (getNumElements() > 0) {
//        printf("rows: %d, cols: %d, stride: %d\n", getNumRows(), getNumCols(), getStride());
        cublasStatus status = cublasGetMatrix(getLeadingDim(), getFollowingDim(), sizeof(float),
                                              _devData, getStride(), hostMatrix.getData(), hostMatrix.getLeadingDim());
        if (status != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "!!!! device access error (read)\n");
            exit(EXIT_FAILURE);
        }
    }
}

void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const {
    if (resizeTarget) {
        hostMatrix.resize(_numRows, _numCols);
    }
    copyToHost(hostMatrix);
}
*/

void NVMatrix::copy(NVMatrix& dest) const {
    dest.resize(*this);
    copy(dest, 0, -1, 0, -1, 0, 0);
}

NVMatrix& NVMatrix::copy() const {
    NVMatrix* c = new NVMatrix();
    copy(*c);
    return *c;
}

void NVMatrix::rightMult(const NVMatrix &b, float scaleAB, NVMatrix &target) const {
    assert(isContiguous() && b.isContiguous() && target.isContiguous());
//    assert(&target != &b);
    assert(_numCols == b.getNumRows());
    if (&target != this) {
        target.resize(_numRows, b.getNumCols());
        target.setTrans(true);
    }
    assert(target.getNumRows() == _numRows);
    assert(target.getNumCols() == b.getNumCols());
    if (_numRows % 64 != 0 || _numCols % 64 != 0 || b.getNumCols() % 64 != 0) {
        WARN("Matrix dimensions not divisible by 64 -- cublasSgemm performance may suffer.");
    }
    cublasStatus_t err;
    float zero = 0;
    err = cublasSgemm(handle, getTransOp(), b.getTransOp(),
                      _numRows, b.getNumCols(), _numCols,
                      &scaleAB, _devData, getLeadingDim(), b.getDevData(),
                      b.getLeadingDim(),
                      &zero, target.getDevData(), getNumRows());
    checkCublasError(err, "cublasSgemm failed");
//    cudaThreadSynchronize();
}

void NVMatrix::rightMult(const NVMatrix &b, float scaleAB) {
    rightMult(b, scaleAB, *this);
}

void NVMatrix::rightMult(const NVMatrix &b, NVMatrix& target) const {
    rightMult(b, 1, target);
}
364
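/*
 * Usage sketch (illustration only, not part of the original file): computes
 * C = 2 * A * B for two 128x128 matrices.  Assumes the cuBLAS handle used by
 * rightMult and the curand state (for randomizeUniform) were set up elsewhere.
 */
#if 0
NVMatrix A(128, 128, true), B(128, 128, true), C;
A.randomizeUniform();
B.randomizeUniform();
A.rightMult(B, 2.0f, C); // C is resized to 128x128 and marked column-major
#endif
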
/*
 * This will only work if this matrix is in column-major order! In other words,
 * if isTrans() returns true.
 */
void NVMatrix::addProduct(const NVMatrix& a, const NVMatrix &b, float scaleThis, float scaleAB) {
    if (scaleThis == 0) {
        a.rightMult(b, scaleAB, *this);
        return;
    }
    assert(isContiguous());
    assert(a.getNumCols() == b.getNumRows());
    assert(this->getNumRows() == a.getNumRows());
    assert(this->getNumCols() == b.getNumCols());
    assert(_isTrans);
    if (a.getNumRows() % 64 != 0 || a.getNumCols() % 64 != 0 || b.getNumCols() % 64 != 0) {
        WARN("Matrix dimensions not divisible by 64 -- cublasSgemm performance may suffer.");
    }
    cublasStatus_t err;
    err = cublasSgemm(handle, a.getTransOp(), b.getTransOp(),
                      a.getNumRows(), b.getNumCols(), a.getNumCols(),
                      &scaleAB, a.getDevData(), a.getLeadingDim(),
                      b.getDevData(), b.getLeadingDim(),
                      &scaleThis, _devData, getLeadingDim());
    checkCublasError(err, "cublasSgemm failed");
//    cudaThreadSynchronize();
}

void NVMatrix::addProduct(const NVMatrix& a, const NVMatrix &b) {
    addProduct(a, b, 1, 1);
}

template <class Randomizer>
void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd) {
    assert(isRndInitialized());
    assert(isContiguous() && target.isContiguous());
    if (!isSameDims(target)) {
        target.resize(*this);
    }
    assert(isTrans() == target.isTrans());
    kUnaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
    cutilCheckMsg("kUnaryRandomize: Kernel execution failed");
}

template <class Randomizer>
void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd) {
    assert(isRndInitialized());
    assert(isContiguous() && data2.isContiguous() && target.isContiguous());
    assert(isSameDims(data2));
    assert(isTrans() == data2.isTrans());
    if (!isSameDims(target)) {
        target.resize(*this);
    }
    assert(isTrans() == target.isTrans());
    kBinaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(getDevData(), data2.getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
    cutilCheckMsg("kBinaryRandomize: Kernel execution failed");
}

/* Function removed by Ian Goodfellow.
   We do not need this function in theano / pylearn2 and it uses cudaMalloc directly.
   If you need to enable it, modify it to use device_malloc instead.
   Otherwise, theano will not be able to keep track of how much memory is used on
   the device.
void NVMatrix::initRandom(unsigned long long seed) {
    assert(!isRndInitialized());
    pthread_mutex_lock(_rndMutex);
    int d = getDeviceID();
    rndDevStates[d] = NULL;
    CUDA_CALL(cudaMalloc((void **)&rndDevStates[d], NUM_RND_STREAMS * sizeof(curandState)));
    pthread_mutex_unlock(_rndMutex);
    kSetupCurand<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one
    cutilCheckMsg("initRandom: Kernel execution failed");
}

void NVMatrix::initRandom() {
    NVMatrix::initRandom(time(0));
}
*/

curandState* NVMatrix::getCurandState() {
    pthread_mutex_lock(_rndMutex);
    int d = getDeviceID();
    assert(rndDevStates.count(d) != 0);
    curandState* r = rndDevStates[d];
    pthread_mutex_unlock(_rndMutex);
    return r;
}

int NVMatrix::getDeviceID() {
    int d;
    cudaGetDevice(&d);
    return d;
}

bool NVMatrix::isRndInitialized() {
    pthread_mutex_lock(_rndMutex);
    bool b = rndDevStates.count(getDeviceID()) != 0;
    pthread_mutex_unlock(_rndMutex);
    return b;
}

/* Function removed by Ian Goodfellow due to not needing
   it and it using cudaFree instead of device_free
void NVMatrix::destroyRandom() {
    assert(isRndInitialized());
    int d = getDeviceID();

    pthread_mutex_lock(_rndMutex);
    CUDA_CALL(cudaFree(rndDevStates[d]));
    rndDevStates.erase(d);
    pthread_mutex_unlock(_rndMutex);
} */

void NVMatrix::binarizeProbs() {
    binarizeProbs(*this);
}

void NVMatrix::binarizeProbs(NVMatrix& target) {
    _unaryRandomize(target, BinarizeUnaryRandomizer());
}

void NVMatrix::randomizeUniform() {
    assert(isContiguous());
    assert(isRndInitialized());
//    CURAND_CALL(curandGenerateUniform(rndGen, _devData, getNumElements()));
    _unaryRandomize(*this, UniformUnaryRandomizer());
}

void NVMatrix::randomizeGaussian() {
    randomizeGaussian(1);
}

void NVMatrix::randomizeGaussian(float stdev) {
    randomizeGaussian(0, stdev);
}

void NVMatrix::randomizeGaussian(float mean, float stdev) {
    assert(isContiguous());
    assert(isRndInitialized());
//    CURAND_CALL(curandGenerateNormal(rndGen, _devData, getNumElements(), mean, stdev));
    _unaryRandomize(*this, GaussianUnaryRandomizer(mean, stdev));
}

/*
 * Kind of a hack since we don't actually need the contents of this matrix for it,
 * so we don't really need a binary randomizer.
 */
void NVMatrix::randomizeGaussian(NVMatrix& stdevs) {
    _binaryRandomize(stdevs, *this, GaussianBinaryRandomizer());
}

void NVMatrix::addGaussianNoise() {
    addGaussianNoise(1);
}

void NVMatrix::addGaussianNoise(float stdev) {
    addGaussianNoise(stdev, *this);
}

void NVMatrix::addGaussianNoise(float stdev, NVMatrix& target) {
    _unaryRandomize(target, AddGaussianUnaryRandomizer(stdev));
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var) {
    addGaussianNoise(stdevs, var, *this);
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs) {
    addGaussianNoise(stdevs, false, *this);
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target) {
    if (var) {
        _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<true>());
    } else {
        _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<false>());
    }
}

void NVMatrix::biggerThan(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::BiggerThan(), b, target);
}

void NVMatrix::biggerThan(NVMatrix& b) {
    biggerThan(b, *this);
}

void NVMatrix::equals(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Equals(), b, target);
}

void NVMatrix::equals(NVMatrix& m) {
    equals(m, *this);
}

void NVMatrix::biggerThanVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::BiggerThan(), vec, target);
}

void NVMatrix::biggerThanVector(NVMatrix& vec) {
    biggerThanVector(vec, *this);
}

void NVMatrix::_checkBounds(int startRow, int endRow, int startCol, int endCol) const {
    assert(startRow >= 0 && startRow < _numRows);
    assert(endRow > startRow && endRow <= _numRows);
    assert(startCol >= 0 && startCol < _numCols);
    assert(endCol > startCol && endCol <= _numCols);
}

/*
 * The only place where stride is supported for now!
 * Will ALWAYS return a view of the original data, sometimes non-contiguous.
 */
NVMatrix& NVMatrix::slice(int startRow, int endRow, int startCol, int endCol) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);
    if (!isTrans()) {
        return *new NVMatrix(this->_devData + startRow * _stride + startCol, endRow - startRow, endCol - startCol, _stride, false);
    }
    return *new NVMatrix(this->_devData + startCol * _stride + startRow, endRow - startRow, endCol - startCol, _stride, true);
}

/* this will NEVER return a view */
void NVMatrix::slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);

    int sliceRows = endRow - startRow, sliceCols = endCol - startCol;
    if (target.getNumRows() != sliceRows || target.getNumCols() != sliceCols) {
        target.resize(sliceRows, sliceCols);
    }
    this->copy(target, startRow, endRow, startCol, endCol, 0, 0);
}

NVMatrix& NVMatrix::sliceRows(int startRow, int endRow) const {
    return slice(startRow, endRow, 0, -1);
}

void NVMatrix::sliceRows(int startRow, int endRow, NVMatrix& target) const {
    slice(startRow, endRow, 0, -1, target);
}

NVMatrix& NVMatrix::sliceCols(int startCol, int endCol) const {
    return slice(0, -1, startCol, endCol);
}

void NVMatrix::sliceCols(int startCol, int endCol, NVMatrix& target) const {
    slice(0, -1, startCol, endCol, target);
}

/*
 * Guaranteed to not change the data if the number of elements doesn't change.
 * So you can use this to "reshape" a matrix.
 */
bool NVMatrix::resize(int numRows, int numCols) {
    bool reallocated = false;
    if (numRows != _numRows || numCols != _numCols) {
        // this assertion was removed by Ian Goodfellow because it seems to come too early
        // assert(_ownsData);
        if (_numElements != numRows * numCols) {
            assert(_ownsData); // assert moved here by Ian Goodfellow
            if (_numElements > 0) { // free old memory
                // This line was modified by Ian Goodfellow to use device_free so theano may track device memory usage accurately
                int status = device_free(_devData);
                if (status != 0) {
                    fprintf(stderr, "!!!! memory free error: %X\n", status);
                    exit(EXIT_FAILURE);
                }
            }
            if (numRows * numCols > 0) { // allocate new memory
                cudaError_t status = cudaMalloc((void**) &_devData,
                                                numCols * numRows * sizeof(float));
                if (status != cudaSuccess) {
                    fprintf(stderr, "!!!! device memory allocation error\n");
                    exit(EXIT_FAILURE);
                }
            } else {
                _devData = NULL;
            }
            reallocated = true;
        }
        _numRows = numRows;
        _numCols = numCols;
        _numElements = numRows * numCols;
        _stride = getLeadingDim();
    }
    return reallocated;
}
656
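/*
 * Illustration only (not part of the original file): resize() keeps the
 * existing buffer when the element count is unchanged, so it doubles as a
 * cheap reshape; changing the element count frees and reallocates.
 */
#if 0
NVMatrix m(4, 8, false);
m.resize(8, 4);   // 32 elements either way: no reallocation, returns false
m.resize(16, 16); // element count changes: buffer reallocated, returns true
#endif
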
bool NVMatrix::resize(const NVMatrix& like) {
    setTrans(like.isTrans());
    return resize(like.getNumRows(), like.getNumCols());
}

/*
bool NVMatrix::resize(const Matrix& like) {
    setTrans(like.isTrans());
    return resize(like.getNumRows(), like.getNumCols());
}
*/

void NVMatrix::reshape(int numRows, int numCols) {
    assert(isContiguous());
    assert(_numElements == numRows*numCols);
    _numRows = numRows;
    _numCols = numCols;
    _stride = getLeadingDim();
}

NVMatrix& NVMatrix::reshaped(int numRows, int numCols) {
    assert(isContiguous());
    assert(_numElements == numRows*numCols);
    return *new NVMatrix(_devData, numRows, numCols, -1, _isTrans);
}

void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow,
                    int srcStartCol, int srcEndCol,
                    int destStartRow, int destStartCol) const {
    srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow;
    srcEndCol = srcEndCol < 0 ? _numCols : srcEndCol;
    NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol);
    NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol);
    srcSlice->apply(NVMatrixOps::Identity(), *destSlice);
    delete srcSlice;
    delete destSlice;
}

NVMatrix& NVMatrix::getTranspose() {
    return *new NVMatrix(_devData, _numCols, _numRows, _stride, !_isTrans);
}

void NVMatrix::transpose(NVMatrix& target) {
    flipTrans(target);
    target.setTrans(!target.isTrans());
    target.reshape(target.getNumCols(), target.getNumRows());
}

void NVMatrix::transpose() {
    int tmp = _numCols;
    _numCols = _numRows;
    _numRows = tmp;
    _isTrans = !_isTrans;
}

bool NVMatrix::transpose(bool trans) {
    bool oldTrans = _isTrans;
    if (oldTrans != trans) {
        transpose();
    }
    return oldTrans;
}

/*
 * Flips the ordering of the matrix from row-major to column-major and vice versa.
 * This creates temporary storage -- not a cheap operation.
 *
 * This is not equivalent to a "hard transpose". The resultant matrix still has
 * the same dimensions, its layout in memory just changes.
 */
NVMatrix& NVMatrix::flipTrans() {
    NVMatrix* meTrans = new NVMatrix(*this);
    flipTrans(*meTrans);
    return *meTrans;
}

void NVMatrix::flipTrans(NVMatrix& target) {
    assert(&target != this);
    target.resize(_numRows, _numCols);
    target.setTrans(!isTrans());
    apply(NVMatrixOps::Identity(), target);
}
740
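/*
 * Illustration only (not part of the original file): transpose() is O(1) --
 * it just swaps the dimensions and flips the _isTrans flag -- while
 * flipTrans() allocates a new buffer and physically rewrites the data in
 * the opposite storage order, keeping the logical contents the same.
 */
#if 0
NVMatrix m(2, 3, false);      // row-major 2x3
m.transpose();                // now reports 3x2; no data movement
NVMatrix& f = m.flipTrans();  // same logical contents, opposite storage order
delete &f;                    // flipTrans() heap-allocates its result
#endif
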
void NVMatrix::squaredDiff(NVMatrix& b) {
    squaredDiff(b, *this);
}

void NVMatrix::squaredDiff(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::SquaredDiff(), b, target);
}

void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target) {
    if (scaleA == 0) {
        b.scale(scaleB, target);
        return;
    }
    if (scaleA == 1 && scaleB == 1) { // slight optimization
        applyBinary(NVMatrixBinaryOps::Add(), b, target);
    } else {
        applyBinary(NVMatrixBinaryOps::WeightedAdd(scaleA, scaleB), b, target);
    }
}

void NVMatrix::add(NVMatrix& b, float scaleB, NVMatrix& target) {
    add(b, 1, scaleB, target);
}

void NVMatrix::add(NVMatrix& b, NVMatrix& target) {
    add(b, 1, target);
}

void NVMatrix::add(NVMatrix& b, float scaleB) {
    add(b, scaleB, *this);
}

void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB) {
    add(b, scaleA, scaleB, *this);
}

void NVMatrix::add(NVMatrix& b) {
    add(b, 1, *this);
}

void NVMatrix::subtract(NVMatrix& b, NVMatrix& target) {
    add(b, -1, target);
}

void NVMatrix::subtract(NVMatrix& b) {
    add(b, -1);
}

void NVMatrix::eltwiseMult(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Multiply(), b, target);
}

void NVMatrix::eltwiseMult(NVMatrix& b) {
    eltwiseMult(b, *this);
}

void NVMatrix::eltwiseDivide(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Divide(), b, target);
}

void NVMatrix::eltwiseDivide(NVMatrix& b) {
    eltwiseDivide(b, *this);
}

void NVMatrix::tile(int timesY, int timesX, NVMatrix& target) {
    assert(isContiguous() && target.isContiguous());
    assert(timesX > 0 && timesY > 0);
    target.resize(_numRows*timesY, _numCols*timesX);
    target.setTrans(_isTrans);
    if (!isTrans()) {
        kTile<<<NUM_TILE_BLOCKS,NUM_TILE_THREADS_PER_BLOCK>>>(_devData, target._devData, _numCols, _numRows, target._numCols, target._numRows);
    } else {
        kTile<<<NUM_TILE_BLOCKS,NUM_TILE_THREADS_PER_BLOCK>>>(_devData, target._devData, _numRows, _numCols, target._numRows, target._numCols);
    }
    cutilCheckMsg("Kernel execution failed");
}

void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::WeightedAdd(1, scaleVec), vec, target);
}

void NVMatrix::addVector(NVMatrix& vec) {
    addVector(vec, 1, *this);
}

void NVMatrix::addVector(NVMatrix& vec, float scaleVec) {
    addVector(vec, scaleVec, *this);
}

void NVMatrix::addVector(NVMatrix& vec, NVMatrix& target) {
    addVector(vec, 1, target);
}

void NVMatrix::equalsVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::Equals(), vec, target);
}

void NVMatrix::equalsVector(NVMatrix& vec) {
    equalsVector(vec, *this);
}

void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::Multiply(), vec, target);
}

void NVMatrix::eltwiseMultByVector(NVMatrix& vec) {
    eltwiseMultByVector(vec, *this);
}

void NVMatrix::eltwiseDivideByVector(NVMatrix& vec) {
    eltwiseDivideByVector(vec, *this);
}

void NVMatrix::eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::Divide(), vec, target);
}

/*
 * num threads per block is ignored when summing rows (axis=1) because
 * it has to be a power of 2.
 *
 * TODO: this is a mess, fix it. it works pretty fast but it's too ugly.
 * TODO: this function is _really_ bad for very long aggregations of few columns.
 */
template<class Agg, class BinaryOp>
void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp op) {
    assert(axis == 0 || axis == 1);
    assert(isContiguous() && target.isContiguous());
    assert(&target != this);
    int width = _isTrans ? _numRows : _numCols;
    int height = _isTrans ? _numCols : _numRows;

    target.setTrans(_isTrans);
    assert(width > 0);
    assert(height > 0);
    if (axis == 0 && !_isTrans || axis == 1 && _isTrans) { // col sum
        target.resize(!_isTrans ? 1 : _numRows, !_isTrans ? _numCols : 1);
        int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK);
        assert(numBlocks * NUM_SUM_COLS_THREADS_PER_BLOCK >= width);
        assert(numBlocks < NUM_BLOCKS_MAX);
        kDumbAggCols<Agg, BinaryOp><<<numBlocks,NUM_SUM_COLS_THREADS_PER_BLOCK>>>(_devData, target._devData, width, height, agg, op);
        cutilCheckMsg("kDumbAggCols: Kernel execution failed");
    } else { // row sum
        target.resize(_isTrans ? 1 : _numRows, _isTrans ? _numCols : 1);
        if (width > 1) {
            if (height >= 16384) { // linear aggregation
                int numBlocksX = 1;
                int numBlocksY = DIVUP(height, AGG_SHORT_ROWS_THREADS_Y*AGG_SHORT_ROWS_LOOPS_Y);
                int numThreadsX = width <= 4 ? 4 : width <= 8 ? 8 : width <= 12 ? 12 : width <= 16 ? 16 : AGG_SHORT_ROWS_THREADS_X;
                int numThreadsY = AGG_SHORT_ROWS_THREADS_Y;
                while (numBlocksY > NUM_BLOCKS_MAX) {
                    numBlocksY = DIVUP(numBlocksY,2);
                    numBlocksX *= 2;
                }
                dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY);
                if (width <= 16) {
                    if (width <= 4) {
                        kAggShortRows<Agg, BinaryOp, 1, 4><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    } else if (width <= 8) {
                        kAggShortRows<Agg, BinaryOp, 1, 8><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    } else if (width <= 12) {
                        kAggShortRows<Agg, BinaryOp, 1, 12><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    } else {
                        kAggShortRows<Agg, BinaryOp, 1, 16><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    }
                } else if (width <= 32) {
                    kAggShortRows<Agg, BinaryOp, 2, AGG_SHORT_ROWS_THREADS_X><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                } else if (width <= 48) {
                    kAggShortRows<Agg, BinaryOp, 3, AGG_SHORT_ROWS_THREADS_X><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                } else if (width <= 64) {
                    kAggShortRows<Agg, BinaryOp, 4, AGG_SHORT_ROWS_THREADS_X><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                } else {
                    kAggShortRows2<Agg, BinaryOp><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                }
            } else {
                if (width >= 512) {
                    dim3 threads(AWR_NUM_THREADS);
                    dim3 blocks(1, std::min(1024, height));
                    kAggRows_wholerow_nosync<<<blocks, threads>>>(_devData, target._devData, width, height, agg, op);
//                    dim3 threads(AWR_NUM_THREADS);
//                    dim3 blocks(1, std::min(1024, height));
//                    kAggRows_wholerow<<<blocks, threads>>>(_devData, target._devData, width, height, agg, op);

                } else {
//                    dim3 threads(AWR_NUM_THREADS);
//                    dim3 blocks(1, std::min(1024, height));
//                    kAggRows_wholerow<<<blocks, threads>>>(_devData, target._devData, width, height, agg, op);
                    NVMatrix *prevSum = this;
                    while (prevSum->getLeadingDim() > 1) {
                        int numThreadsX = width <= 64 ? 32 : (width <= 128 ? 64 : (width <= 256 ? 128 : (width <= 512 ? 256 : 512)));
                        int numThreadsY = 1;
                        int numBlocksX = DIVUP(width, 2*numThreadsX);
                        int numBlocksY = std::min(height, NUM_BLOCKS_MAX);
                        NVMatrix *nvSumAccum = target.getFollowingDim() == height && target.getLeadingDim() == numBlocksX ? &target : new NVMatrix(height, numBlocksX, false);

                        dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY);
                        assert(numBlocksX <= NUM_BLOCKS_MAX);
                        assert(numBlocksY <= NUM_BLOCKS_MAX);

                        if (width <= 64) {
                            kAggRows<Agg, BinaryOp, 32><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                           width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else if (width <= 128) {
                            kAggRows<Agg, BinaryOp, 64><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                           width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else if (width <= 256) {
                            kAggRows<Agg, BinaryOp, 128><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                            width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else if (width <= 512) {
                            kAggRows<Agg, BinaryOp, 256><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                            width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else {
                            kAggRows<Agg, BinaryOp, 512><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                            width, height, nvSumAccum->getLeadingDim(), agg, op);
                        }
                        cutilCheckMsg("agg rows: Kernel execution failed");
                        cudaThreadSynchronize();
                        width = numBlocksX; // only true in reduction agg, but for linear agg this doesn't matter anyway

                        if (prevSum != this) {
                            delete prevSum;
                        }
                        prevSum = nvSumAccum;
                    }
                }
            }
        } else {
            copy(target);
        }
    }
}

void NVMatrix::inRangeInc(float lower, float upper) {
    inRangeInc(lower, upper, *this);
}

void NVMatrix::inRangeInc(float lower, float upper, NVMatrix& target) {
    apply(NVMatrixOps::InRange<false>(lower, upper), target);
}

void NVMatrix::inRangeExc(float lower, float upper) {
    inRangeExc(lower, upper, *this);
}

void NVMatrix::inRangeExc(float lower, float upper, NVMatrix& target) {
    apply(NVMatrixOps::InRange<true>(lower, upper), target);
}

void NVMatrix::biggerThanScalar(float scalar) {
    biggerThanScalar(scalar, *this);
}

void NVMatrix::biggerThanScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::BiggerThanScalar(scalar), target);
}

void NVMatrix::smallerThanScalar(float scalar) {
    smallerThanScalar(scalar, *this);
}

void NVMatrix::smallerThanScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::SmallerThanScalar(scalar), target);
}

void NVMatrix::addScalar(float scaleThis, float scalar, NVMatrix& target) {
    apply(NVMatrixOps::WeightedAddScalar(scaleThis, scalar), target);
}

void NVMatrix::addScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::AddScalar(scalar), target);
}

void NVMatrix::addScalar(float scalar) {
    addScalar(scalar, *this);
}

void NVMatrix::minWithScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::MinWithScalar(scalar), target);
}

void NVMatrix::minWithScalar(float scalar) {
    minWithScalar(scalar, *this);
}

void NVMatrix::maxWithScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::MaxWithScalar(scalar), target);
}

void NVMatrix::maxWithScalar(float scalar) {
    maxWithScalar(scalar, *this);
}

void NVMatrix::pow(float p, NVMatrix& target) {
    apply(NVMatrixOps::Pow(p), target);
}

void NVMatrix::pow(float p) {
    pow(p, *this);
}

void NVMatrix::scale(float _scale) {
    scale(_scale, *this);
}

void NVMatrix::scale(float _scale, NVMatrix& target) {
    if (_scale != 1 || &target != this) { // optimize away scale by 1
        apply(NVMatrixOps::MultByScalar(_scale), target);
    }
}

template<class Agg, class BinaryOp>
NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp op) {
    NVMatrix *sumVec = new NVMatrix();
    _aggregate<Agg, BinaryOp>(axis, *sumVec, agg, op);
    return *sumVec;
}

void NVMatrix::max(int axis, NVMatrix& target) {
    _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
}

void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum) {
    if (scaleThis != 0) {
        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleSum));
    } else {
        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::SecondScaled(scaleSum));
    }
}

void NVMatrix::sum(int axis, NVMatrix& target) {
    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
}

/*
void NVMatrix::min(int axis, NVMatrix& target) {
    _aggregate(axis, target, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
}

NVMatrix& NVMatrix::max(int axis) {
    return _aggregate(axis, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
}

NVMatrix& NVMatrix::sum(int axis) {
    return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
}

NVMatrix& NVMatrix::min(int axis) {
    return _aggregate(axis, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
}
*/
1088
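/*
 * Usage sketch (illustration only, not part of the original file): axis 0
 * aggregates down the rows (one result per column), axis 1 across the columns.
 */
#if 0
NVMatrix m(100, 10, false), colSums, rowMax;
m.sum(0, colSums); // colSums becomes 1x10
m.max(1, rowMax);  // rowMax becomes 100x1
#endif
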
void NVMatrix::_sum_setParams(int n, dim3* blocks, dim3* threads, int* numCols) {
    // Split the n elements into roughly n / log2(n) columns, so each thread
    // accumulates about log2(n) values before the block-level reduction.
    int logn = int(ceil(log(double(n)) / log(2.)));
    *numCols = DIVUP(n, logn);
    int numThreads = *numCols;
    *blocks = dim3(DIVUP(numThreads, DP_BLOCKSIZE));
    *threads = dim3(DP_BLOCKSIZE);
}

/*
float NVMatrix::mean() {
    return sum() / getNumElements();
}

float NVMatrix::sum() {
    return _totalAgg(NVMatrixAggs::Sum());
}

float NVMatrix::max() {
    return _totalAgg(NVMatrixAggs::Max());
}

float NVMatrix::min() {
    return _totalAgg(NVMatrixAggs::Min());
}

template<class Agg>
float NVMatrix::_totalAgg(Agg agg) {
    assert(isContiguous());
    dim3 blocks, threads;
    int numCols;
    // Sum most of it on GPU
    NVMatrix* src = this;
    for (NVMatrix* target = NULL; src->getNumElements() > CPUSUM_MAX; src = target) {
        _sum_setParams(src->getNumElements(), &blocks, &threads, &numCols);
        target = new NVMatrix(1, blocks.x);
        kTotalAgg<<<blocks, threads>>>(src->getDevData(), target->getDevData(), numCols, src->getNumElements(), agg);
        cutilCheckMsg("kTotalAgg: Kernel execution failed");
        cudaThreadSynchronize(); // not really necessary?
        delete (src == this ? NULL : src);
    }

    Matrix srcCPU(src->getNumRows(), src->getNumCols());
    src->copyToHost(srcCPU);
    if (src->getNumElements() > 1) { // Sum remainder on CPU
        delete (src == this ? NULL : src);
        if (typeid(Agg) == typeid(NVMatrixAggs::Sum)) {
            return srcCPU.sum();
        } else if (typeid(Agg) == typeid(NVMatrixAggs::Max)) {
            return srcCPU.max();
        } else if (typeid(Agg) == typeid(NVMatrixAggs::Min)) {
            return srcCPU.min();
        } else {
            assert(false);
        }
    }
    return srcCPU(0,0);
}
*/

/*
 * Fast dot product only for matrices with same transposedness.
float NVMatrix::dotProduct(NVMatrix& b) {
    assert(isContiguous() && b.isContiguous());
    assert(isSameDims(b));
    assert(isTrans() == b.isTrans()); // see?
    dim3 blocks, threads;
    int numCols;
    _sum_setParams(getNumElements(), &blocks, &threads, &numCols);
    NVMatrix target(1, blocks.x);
    kDotProduct_r<<<blocks, threads>>>(getDevData(), b.getDevData(), target.getDevData(), numCols, getNumElements());
    cutilCheckMsg("kDotProduct: Kernel execution failed");
    cudaThreadSynchronize();
    return target.sum();
}

float NVMatrix::norm2() {
    return dotProduct(*this);
}

float NVMatrix::norm() {
    return sqrt(norm2());
}
*/

/*
void NVMatrix::print(int startRow, int rows, int startCol, int cols) const {
    cudaThreadSynchronize();
    Matrix hm = Matrix(_numRows, _numCols);
    copyToHost(hm);
    hm.print(startRow, rows, startCol, cols);
}

void NVMatrix::print(int rows, int cols) const {
    print(0, rows, 0, cols);
}
*/

void NVMatrix::printShape(const char* name) const {
    printf("%s: %dx%d\n", name, _numRows, _numCols);
}

/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _CONV_UTIL_EXPORT
#define _CONV_UTIL_EXPORT
#endif

#include <iostream>
#include <assert.h>
#include <nvmatrix_kernels.cuh>
#include <nvmatrix.cuh>
#include <conv_util.cuh>

using namespace std;

__device__ inline float square(const float a) {
    return a * a;
}

/*
 * blockIdx.y determines module in batches of B_Y
 * blockIdx.x determines filter in batches of B_X * filtersPerThread
 *
 * weights: (numModules, numColors, filterPixels, numFilters)
 * Not fully coalesced if B_X < 32, so use cache.
 */
template <int B_Y, int B_X, int filtersPerThread>
__global__ void kNormalizeLCWeights(float* weights, const uint numFilters, const int numModules, const uint weightsPerFilter, const float norm) {
    const uint moduleIdx = B_Y * blockIdx.y + threadIdx.y;
    const uint filterIdx = B_X * blockIdx.x + threadIdx.x;

    float prod[filtersPerThread];
    #pragma unroll
    for (uint i = 0; i < filtersPerThread; ++i) {
        prod[i] = 0;
    }
    if (moduleIdx < numModules) {
        weights += moduleIdx * weightsPerFilter * numFilters + filterIdx;
        for (uint p = 0; p < weightsPerFilter; ++p) {
            #pragma unroll
            for (uint i = 0; i < filtersPerThread; ++i) {
                prod[i] += square(weights[p * numFilters + i * B_X]);
            }
        }

        #pragma unroll
        for (uint i = 0; i < filtersPerThread; ++i) {
            prod[i] = sqrtf(prod[i]);
            prod[i] = prod[i] > norm ? __fdividef(norm, prod[i]) : 1.0f;
        }

        for (uint p = 0; p < weightsPerFilter; ++p) {
            #pragma unroll
            for (uint i = 0; i < filtersPerThread; ++i) {
                weights[p * numFilters + i * B_X] *= prod[i];
            }
        }
    }
}

/*
 * weights: (numModules, numColors, filterPixels, numFilters)
 */
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm) {
    int numFilters = weights.getNumCols();
    int weightsPerFilter = weights.getNumRows() / numModules;
    assert(numModules * weightsPerFilter == weights.getNumRows());

    assert(!weights.isTrans());
    assert(weights.isContiguous());
    assert(numFilters % 16 == 0);

    int bx = numFilters % 32 == 0 ? 32 : 16;
    int by = bx == 32 ? 4 : 8;

    int filtersPerThread = numFilters % 128 == 0 ? 4 : numFilters % 64 == 0 ? 2 : 1;
    dim3 blocks(numFilters / (bx * filtersPerThread), DIVUP(numModules, by));
    dim3 threads(bx, by);
    if (filtersPerThread == 4) {
        cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 4>, cudaFuncCachePreferL1);
        kNormalizeLCWeights<4, 32, 4><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else if (filtersPerThread == 2) {
        cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 2>, cudaFuncCachePreferL1);
        kNormalizeLCWeights<4, 32, 2><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else {
        if (numFilters % 32 == 0) {
            cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 1>, cudaFuncCachePreferL1);
            kNormalizeLCWeights<4, 32, 1><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
        } else {
            cudaFuncSetCacheConfig(kNormalizeLCWeights<8, 16, 1>, cudaFuncCachePreferL1);
            kNormalizeLCWeights<8, 16, 1><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
        }
    }
}
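
/*
 * Usage sketch (illustration only, not part of the original file): caps the
 * L2 norm of every local filter at 1.  As documented above, weights must be
 * laid out as (numModules * weightsPerFilter) x numFilters.
 */
#if 0
normalizeLocalWeights(weights, numModules, 1.0f);
#endif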

/*
 * Block size 4x32
 * blockIdx.x determines img idx in batches of 32*imgsPerThread
 * blockIdx.y determines channel idx, pixel idx in batches of 4
 *
 * threadIdx.x determines case idx
 * threadIdx.y determines pixel idx
 *
 * imgs: (numChannels, imgPixels, numImages) with given imgStride
 * target: (numChannels, tgtPixels, numImages)
 */
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kCrop(float* imgs, float* target, const uint numImages, const int imgStride,
                      const uint imgSize, const uint tgtSize, const uint startY, const uint startX) {
    const uint imgPixels = imgSize * imgSize;
    const uint tgtPixels = tgtSize * tgtSize;
    const uint caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const uint blockChanIdx = blockIdx.y / DIVUP(tgtPixels, 4);
    const uint tgtPixelIdx = 4*(blockIdx.y % DIVUP(tgtPixels, 4)) + threadIdx.y;
    const uint tgtPxY = tgtPixelIdx / tgtSize;
    const uint tgtPxX = tgtPixelIdx % tgtSize;
    const uint srcPixelIdx = (startY + tgtPxY) * imgSize + startX + tgtPxX;

    if (tgtPixelIdx < tgtPixels) {
        imgs += (blockChanIdx * imgPixels + srcPixelIdx) * imgStride + caseIdx;
        target += (blockChanIdx * tgtPixels + tgtPixelIdx) * numImages + caseIdx;

        #pragma unroll
        for (uint i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || (caseIdx + 32 * i < numImages)) {
                target[i * 32] = imgs[i * 32];
            }
        }
    }
}

/*
 * Block size 4x32
 * blockIdx.y determines pixel idx in batches of 4
 * blockIdx.x determines case idx in batches of 32*imgsPerThread
 * threadIdx.y determines pixel idx
 * threadIdx.x determines case idx
 *
 * imgs: (3, imgPixels, numImages) with given imgStride
 * target: (3, imgPixels, numImages)
 *
 * Each thread produces (y,u,v) values for a particular (r,g,b) pixel
 *
 * The RGB --> YUV transform is (http://en.wikipedia.org/wiki/YUV):
 *
 * [Y]   [ 0.2126    0.7152   0.0722 ][R]
 * [U] = [-0.09991  -0.33609  0.436  ][G]
 * [V]   [ 0.615    -0.55861 -0.05639][B]
 */
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kRGBToYUV(float* imgs, float* target, const int imgPixels, const int numImages, const int imgStride) {
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int pxIdx = blockIdx.y * 4 + threadIdx.y;

    if (pxIdx < imgPixels) {
        const int imgChannelStride = imgPixels * imgStride;
        const int tgtChannelStride = imgPixels * numImages;
        imgs += pxIdx * imgStride + caseIdx;
        target += pxIdx * numImages + caseIdx;

        #pragma unroll
        for (int i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
                const float R = imgs[0 * imgChannelStride + i * 32];
                const float G = imgs[1 * imgChannelStride + i * 32];
                const float B = imgs[2 * imgChannelStride + i * 32];
                target[0 * tgtChannelStride + i * 32] = 0.2126f * R + 0.7152f * G + 0.0722f * B;    // Y
                target[1 * tgtChannelStride + i * 32] = -0.09991f * R + -0.33609f * G + 0.436f * B; // U
                target[2 * tgtChannelStride + i * 32] = 0.615f * R + -0.55861f * G + -0.05639f * B; // V
            }
        }
    }
}
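
/*
 * Host-side reference (illustration only, not part of the original file):
 * the same per-pixel transform as kRGBToYUV, handy when checking kernel
 * output against a known value.
 */
#if 0
void rgbToYuvReference(float R, float G, float B, float* yuv) {
    yuv[0] =  0.2126f  * R + 0.7152f  * G + 0.0722f  * B; // Y
    yuv[1] = -0.09991f * R - 0.33609f * G + 0.436f   * B; // U
    yuv[2] =  0.615f   * R - 0.55861f * G - 0.05639f * B; // V
}
#endif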

__device__ inline float labf(const float x) {
    if (x > 0.0088564517f) {
        return __powf(x, 0.3333f);
    }
    return 7.787037f * x + 0.13793103f;
}

/*
 * Block size 4x32
 * blockIdx.y determines pixel idx in batches of 4
 * blockIdx.x determines case idx in batches of 32*imgsPerThread
 * threadIdx.y determines pixel idx
 * threadIdx.x determines case idx
 *
 * imgs: (3, imgPixels, numImages) with given imgStride
 * target: (3, imgPixels, numImages)
 *
 * This proceeds in two steps.
 *
 * - First, RGB values are linearly transformed to XYZ as per
 *   http://en.wikipedia.org/wiki/CIE_XYZ_color_space
 * - Second, XYZ values are nonlinearly transformed to L*a*b* as per
 *   http://en.wikipedia.org/wiki/Lab_color_space#The_forward_transformation
 *
 * Each thread produces (L*,a*,b*) values for a particular (r,g,b) pixel
 *
 * The RGB --> XYZ transform is:
 *
 * [X]               [0.49     0.31    0.2    ][R]
 * [Y] = 5.6506753 * [0.17697  0.8124  0.01063][G]
 * [Z]               [0        0.01    0.99   ][B]
 *
 * NOTE: The input should be in the range 0-1. Don't do mean-subtraction beforehand.
 *
 * Then X_max, Y_max, Z_max = 5.6506753.
 *
 * The range of the L* values is [0, 100].
 * If the center flag is given, the range will be [-50, 50].
 */
template <int imgsPerThread, bool checkCaseBounds, bool center>
__global__ void kRGBToLAB(float* imgs, float* target, const int imgPixels, const int numImages, const int imgStride) {
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int pxIdx = blockIdx.y * 4 + threadIdx.y;

    if (pxIdx < imgPixels) {
        const int imgChannelStride = imgPixels * imgStride;
        const int tgtChannelStride = imgPixels * numImages;
        imgs += pxIdx * imgStride + caseIdx;
        target += pxIdx * numImages + caseIdx;

        #pragma unroll
        for (int i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
                const float R = imgs[0 * imgChannelStride + i * 32];
                const float G = imgs[1 * imgChannelStride + i * 32];
                const float B = imgs[2 * imgChannelStride + i * 32];

                const float X = (0.49f * R + 0.31f * G + 0.2f * B);
                const float Y = (0.17697f * R + 0.8124f * G + 0.01063f * B);
                const float Z = (0.01f * G + 0.99f * B);

                const float labX = labf(X);
                const float labY = labf(Y);
                const float labZ = labf(Z);

                target[0 * tgtChannelStride + i * 32] = 116.0f * labY - 16.0f - (center ? 50.0f : 0); // L*
                target[1 * tgtChannelStride + i * 32] = 500.0f * (labX - labY);                       // a*
                target[2 * tgtChannelStride + i * 32] = 200.0f * (labY - labZ);                       // b*
            }
        }
    }
}

/*
 * Block size 16x32.
 * Each block produces a 4x4 chunk of the output image.
 * threadIdx.y determines pixel idx in 4x4 chunk.
 * threadIdx.x determines case idx.
 * blockIdx.x determines case idx in batches of 32*imgsPerThread.
 * blockIdx.y determines 4x4 chunk idx, channel idx.
 *
 * imgs: (numChannels, imgPixels, numImages) with given imgStride
 * target: (numChannels, tgtPixels, numImages)
 *
 * imgSize = scale * tgtSize (roughly)
 *
 * This is a rather naive kernel that relies on cache for speed. But all it's doing
 * is basic texture manipulation, which is very local in nature, so it should be ok.
 * Also, it will in practice be a tiny fraction of the runtime of a large convnet.
 *
 * So that is my justification for being lazy here.
 */
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kResizeBilinear(float* imgs, float* target, const int imgSize, const int tgtSize,
                                const int numImages, const int imgStride, const float scale,
                                const float centerScale) {
    const int numChunksX = DIVUP(tgtSize, 4);
    const int numChunks = numChunksX * numChunksX;
    const int channelIdx = blockIdx.y / numChunks;
    const int chunkIdx = blockIdx.y % numChunks;
    const int chunkIdxX = chunkIdx % numChunksX;
    const int chunkIdxY = chunkIdx / numChunksX;
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int imgPixels = imgSize * imgSize;
    const int tgtPixels = tgtSize * tgtSize;

    const int pxX = 4 * chunkIdxX + threadIdx.y % 4;
    const int pxY = 4 * chunkIdxY + threadIdx.y / 4;

    if (pxY < tgtSize && pxX < tgtSize) {
        const int pxIdx = pxY * tgtSize + pxX;

        imgs += channelIdx * imgPixels * imgStride + caseIdx;
        target += channelIdx * tgtPixels * numImages + pxIdx * numImages + caseIdx;

        // This will cause slight distortions at the edges when upsampling in some cases.
        // But I think that's not a big deal.
        const float srcPxX = fmaxf(0.0f, fminf(__int2float_rn(imgSize) - 1.01f, __int2float_rn(pxX) * scale + centerScale));
        const float srcPxY = fmaxf(0.0f, fminf(__int2float_rn(imgSize) - 1.01f, __int2float_rn(pxY) * scale + centerScale));

        const float u = floorf(srcPxX + 1) - srcPxX;
        const float w = srcPxY - floorf(srcPxY);

        // Consider doing max(0, min(imgSize, x)) here
        const int srcPx0 = (__float2int_rd(srcPxY) * imgSize + __float2int_rd(srcPxX)); // top-left
        const int srcPx1 = srcPx0 + 1;       // top-right
        const int srcPx2 = srcPx0 + imgSize; // bottom-left
        const int srcPx3 = srcPx2 + 1;       // bottom-right

        #pragma unroll
        for (int c = 0; c < imgsPerThread; ++c) {
            if (!checkCaseBounds || caseIdx + c * 32 < numImages) {
                const float val0 = imgs[srcPx0 * imgStride + c * 32];
                const float val1 = imgs[srcPx1 * imgStride + c * 32];
                const float val2 = imgs[srcPx2 * imgStride + c * 32];
                const float val3 = imgs[srcPx3 * imgStride + c * 32];

                const float c0 = u * (val0 - val1) + val1;
                const float c1 = u * (val2 - val3) + val3;

                target[32 * c] = w * (c1 - c0) + c0;
            }
        }
    }
}
1533
1534 /*
1535 * Block size B_YxB_X.
1536 * B_X*imgsPerThread*blockIdx.x + threadIdx.x determines img idx
1537 * B_Y*blockIdx.y + threadIdx.y determines img row (col if !horiz), channel idx
1538 *
1539 * imgs: (numChannels, imgPixels, numImages) with given imgStride
1540 * filter: (1, 2*radius + 1)
1541 * target: (numChannels, imgPixels, numImages)
1542 *
1543 * target can be the same matrix as imgs.
1544 * radius must be one of 3, 5, 7, 9.
1545 *
1546 * Tried imgsPerThread, slower.
1547 */
1548 template<int B_Y, int B_X, int radius>
1549 __global__ void kGaussianBlur(float* imgs, float* filter, float* target, const int imgSize,
1550 const int numImages, const int imgStride,
1551 const bool horiz,
1552 const float scaleTargets, const float scaleOutputs) {
1553 __shared__ float shFilter[2*radius]; // needs filterWidth - 1 = 2*radius entries; see the loads and indexing below
1554
1555 const int imgPixels = imgSize * imgSize;
1556 const int ty = B_Y * blockIdx.y + threadIdx.y;
1557 const int channelIdx = ty / imgSize;
1558 const int rowIdx = ty % imgSize;
1559 const int imgIdx = B_X*blockIdx.x + threadIdx.x;
1560 const int filterWidth = 2*radius+1;
1561 // const int tidx = B_Y * threadIdx.y + threadIdx.x;
1562 if (horiz) {
1563 imgs += channelIdx * imgPixels * imgStride + rowIdx * imgSize * imgStride + imgIdx;
1564 target += channelIdx * imgPixels * numImages + rowIdx * imgSize * numImages + imgIdx;
1565 } else {
1566 imgs += channelIdx * imgPixels * imgStride + rowIdx * imgStride + imgIdx;
1567 target += channelIdx * imgPixels * numImages + rowIdx * numImages + imgIdx;
1568 }
1569 float outputs[filterWidth-1];
1570 #pragma unroll
1571 for (int r = 0; r < filterWidth-1; r++) {
1572 outputs[r] = 0;
1573 }
1574 if (threadIdx.x < filterWidth-1) {
1575 shFilter[threadIdx.x] = filter[threadIdx.x];
1576 }
1577 __syncthreads();
1578
1579 if (imgIdx < numImages) {
1580 // This writes radius*2 = filterWidth - 1 values to outputs
1581 #pragma unroll
1582 for (int col = 0; col < radius; col++) {
1583 float px = imgs[0];
1584 #pragma unroll
1585 for (int r = 0; r < radius + 1 + col; r++) {
1586 outputs[r] += px * shFilter[radius + col - r];
1587 }
1588 imgs += horiz ? imgStride : imgStride * imgSize;
1589 }
1590
1591 // Unfortunately this has to be at this level of granularity
1592 if (scaleTargets != 0) {
1593 for (int col = radius; col < imgSize ; col++) { // loop over img columns
1594 float px = imgs[0];
1595 target[0] = scaleTargets * target[0] + scaleOutputs * (outputs[0] + px * shFilter[0]);
1596
1597 #pragma unroll
1598 for (int r = 1; r < radius*2; r++) {
1599 outputs[r-1] = outputs[r] + px * shFilter[r];
1600 }
1601 outputs[filterWidth - 2] = px * shFilter[0];
1602
1603 imgs += horiz ? imgStride : imgStride * imgSize;
1604 target += horiz ? numImages : numImages * imgSize;
1605 }
1606
1607 #pragma unroll
1608 for (int r = 0; r < radius; r++) {
1609 float* t = &target[0];
1610 t[0] = scaleTargets * t[0] + scaleOutputs * outputs[r];
1611 target += horiz ? numImages : numImages * imgSize;
1612 }
1613 } else {
1614 for (int col = radius; col < imgSize ; col++) { // loop over img columns
1615 float px = imgs[0];
1616 target[0] = scaleOutputs * (outputs[0] + px * shFilter[0]);
1617 #pragma unroll
1618 for (int r = 1; r < radius*2; r++) {
1619 outputs[r-1] = outputs[r] + px * shFilter[r];
1620 }
1621 outputs[filterWidth - 2] = px * shFilter[0];
1622
1623 imgs += horiz ? imgStride : imgStride * imgSize;
1624 target += horiz ? numImages : numImages * imgSize;
1625 }
1626
1627 #pragma unroll
1628 for (int r = 0; r < radius; r++) {
1629 target[0] = scaleOutputs * outputs[r];
1630 target += horiz ? numImages : numImages * imgSize;
1631 }
1632 }
1633 }
1634 }
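/*
 * How the sliding scheme above works: filterWidth - 1 partial sums live in registers,
 * and only 2*radius filter taps are cached because the filter is assumed symmetric
 * (the last tap reuses shFilter[0]). Reading pixel i completes the output at i - radius
 * (the "+ px * shFilter[0]" term), shifts each remaining partial sum down one slot
 * while adding pixel i's tap to it, and starts the output at i + radius with
 * px * shFilter[0]. Border outputs simply omit the out-of-range taps.
 */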
1635
1636 /*
1637 * Block size B_YxB_X
1638 * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
1639 * blockIdx.y determines output.y, channel idx in batches of B_Y*chansPerThread
1640 *
1641 * So each block does one output for some number of images/channels.
1642 *
1643 * threadIdx.x determines img idx
1644 * threadIdx.y determines channel idx
1645 *
1646 * imgs: (numChannels, imgPixels, numImages)
1647 * target: (numChannels, numOutputs, numImages)
1648 *
1649 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
1650 * numChannels must be divisible by chansPerThread
1651 */
1652
1653 template<int B_Y, int B_X, int imgsPerThread, int chansPerThread, bool checkCaseBounds>
1654 __global__ void kBedOfNails(float* imgs, float* target, const int imgSize, const int numChannels,
1655 const int numImages, const int startX, const int strideX, const int outputsX,
1656 const bool reverse, const float scaleTargets, const float scaleOutput) {
1657 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
1658 const int numChanBlocks = DIVUP(numChannels, B_Y*chansPerThread);
1659 const int outputIdxX = blockIdx.x / numImgBlocks;
1660 const int outputIdxY = blockIdx.y / numChanBlocks;
1661 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
1662 const int blockChanIdx = (blockIdx.y % numChanBlocks) * B_Y * chansPerThread;
1663 const int myChanIdx = (blockChanIdx + threadIdx.y*chansPerThread);
1664 if (myChanIdx >= numChannels) {
1665 return;
1666 }
1667 // if (blockIdx.x != 0 || blockIdx.y != 0) {
1668 // return;
1669 // }
1670 const int outputIdx = outputIdxY * outputsX + outputIdxX;
1671 const int numOutputs = outputsX * outputsX;
1672 const int imgPixels = imgSize * imgSize;
1673
1674 const int startImgPxX = startX + outputIdxX * strideX;
1675 const int startImgPxY = startX + outputIdxY * strideX;
1676 const int imgIdx = blockImgIdx + threadIdx.x;
1677 const int imgPx = startImgPxY * imgSize + startImgPxX;
1678
1679 imgs += myChanIdx * imgPixels * numImages + imgPx * numImages + imgIdx;
1680 target += (myChanIdx * numOutputs + outputIdx) * numImages + imgIdx;
1681
1682 if (scaleTargets != 0) {
1683 if (!reverse) {
1684 #pragma unroll
1685 for (int i = 0; i < imgsPerThread; i++) {
1686 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1687 #pragma unroll
1688 for (int c = 0; c < chansPerThread; c++) {
1689 target[c * numOutputs * numImages + i * B_X] = scaleTargets * target[c * numOutputs * numImages + i * B_X] + scaleOutput * imgs[c * imgPixels * numImages + i * B_X];
1690 }
1691 }
1692 }
1693 } else {
1694 #pragma unroll
1695 for (int i = 0; i < imgsPerThread; i++) {
1696 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1697 #pragma unroll
1698 for (int c = 0; c < chansPerThread; c++) {
1699 imgs[c * imgPixels * numImages + i * B_X] = scaleTargets * imgs[c * imgPixels * numImages + i * B_X] + scaleOutput * target[c * numOutputs * numImages + i * B_X];
1700 }
1701 }
1702 }
1703 }
1704 } else {
1705 if (!reverse) {
1706 #pragma unroll
1707 for (int i = 0; i < imgsPerThread; i++) {
1708 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1709 #pragma unroll
1710 for (int c = 0; c < chansPerThread; c++) {
1711 target[c * numOutputs * numImages + i * B_X] = scaleOutput * imgs[c * imgPixels * numImages + i * B_X];
1712 }
1713 }
1714 }
1715 } else {
1716 #pragma unroll
1717 for (int i = 0; i < imgsPerThread; i++) {
1718 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1719 #pragma unroll
1720 for (int c = 0; c < chansPerThread; c++) {
1721 imgs[c * imgPixels * numImages + i * B_X] = scaleOutput * target[c * numOutputs * numImages + i * B_X];
1722 }
1723 }
1724 }
1725 }
1726 }
1727
1728 }
1729
1730 /*
1731 * imgs: (numChannels, imgPixels, numImages)
1732 * target: (numChannels, outputs, numImages)
1733 */
1734 void _convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX, int strideX,
1735 bool reverse, float scaleTargets, float scaleOutput) {
1736 int numImages = reverse ? target.getNumCols() : images.getNumCols();
1737 int imgPixels = imgSize * imgSize;
1738
1739 assert(!images.isTrans());
1740 assert(!target.isTrans());
1741 assert(images.isContiguous());
1742 assert(target.isContiguous());
1743 assert(strideX > 1);
1744
1745 int outputsX = DIVUP(imgSize, strideX);
1746 int outputs = outputsX * outputsX;
1747 if (reverse) {
1748 assert(target.getNumRows() == numChannels * outputs);
1749 } else {
1750 assert(images.getNumRows() == numChannels * imgPixels);
1751 }
1752
1753 if (scaleTargets == 0) {
1754 if (reverse) {
1755 images.resize(numChannels * imgPixels, numImages);
1756 images.apply(NVMatrixOps::Zero());
1757 } else {
1758 target.resize(numChannels*outputs, numImages);
1759 }
1760 } else {
1761 if (reverse) {
1762 assert(images.getNumRows() == numChannels * outputs);
1763 assert(images.getNumCols() == numImages);
1764 } else {
1765 assert(target.getNumRows() == numChannels * outputs);
1766 assert(target.getNumCols() == numImages);
1767 }
1768 }
1769
1770
1771 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
1772 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
1773 int chansPerThread = numChannels % 8 == 0 ? 2 : 1;
1774 dim3 threads(32, 4);
1775 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numChannels, 4 * chansPerThread) * outputsX);
1776
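/*
 * Worked example of the config above: numImages = 100 gives imgsPerThread = 1
 * (100 is a multiple of neither 64 nor 128) and checkCaseBounds = true (100 % 32 != 0);
 * numImages = 256 gives imgsPerThread = 4 and checkCaseBounds = false.
 * numChannels = 16 gives chansPerThread = 2.
 */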
1777 if (imgsPerThread == 4) {
1778 if (chansPerThread == 1) {
1779 if (checkCaseBounds) {
1780 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 1, true>, cudaFuncCachePreferL1);
1781 kBedOfNails<4, 32, 4, 1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1782 imgSize, numChannels, numImages, startX, strideX, outputsX,
1783 reverse, scaleTargets, scaleOutput);
1784 } else {
1785 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 1, false>, cudaFuncCachePreferL1);
1786 kBedOfNails<4, 32, 4, 1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1787 imgSize, numChannels, numImages, startX, strideX, outputsX,
1788 reverse, scaleTargets, scaleOutput);
1789 }
1790 } else {
1791 if (checkCaseBounds) {
1792 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
1793 kBedOfNails<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1794 imgSize, numChannels, numImages, startX, strideX, outputsX,
1795 reverse, scaleTargets, scaleOutput);
1796 } else {
1797 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
1798 kBedOfNails<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1799 imgSize, numChannels, numImages, startX, strideX, outputsX,
1800 reverse, scaleTargets, scaleOutput);
1801 }
1802 }
1803 } else if (imgsPerThread == 2) {
1804 if (chansPerThread == 1) {
1805 if (checkCaseBounds) {
1806 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 1, true>, cudaFuncCachePreferL1);
1807 kBedOfNails<4, 32, 2, 1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1808 imgSize, numChannels, numImages, startX, strideX, outputsX,
1809 reverse, scaleTargets, scaleOutput);
1810 } else {
1811 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 1, false>, cudaFuncCachePreferL1);
1812 kBedOfNails<4, 32, 2, 1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1813 imgSize, numChannels, numImages, startX, strideX, outputsX,
1814 reverse, scaleTargets, scaleOutput);
1815 }
1816 } else {
1817 if (checkCaseBounds) {
1818 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 2, true>, cudaFuncCachePreferL1);
1819 kBedOfNails<4, 32, 2, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1820 imgSize, numChannels, numImages, startX, strideX, outputsX,
1821 reverse, scaleTargets, scaleOutput);
1822 } else {
1823 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 2, false>, cudaFuncCachePreferL1);
1824 kBedOfNails<4, 32, 2, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1825 imgSize, numChannels, numImages, startX, strideX, outputsX,
1826 reverse, scaleTargets, scaleOutput);
1827 }
1828 }
1829 } else {
1830 if (chansPerThread == 1) {
1831 if (checkCaseBounds) {
1832 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 1, true>, cudaFuncCachePreferL1);
1833 kBedOfNails<4, 32, 1, 1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1834 imgSize, numChannels, numImages, startX, strideX, outputsX,
1835 reverse, scaleTargets, scaleOutput);
1836 } else {
1837 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 1, false>, cudaFuncCachePreferL1);
1838 kBedOfNails<4, 32, 1, 1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1839 imgSize, numChannels, numImages, startX, strideX, outputsX,
1840 reverse, scaleTargets, scaleOutput);
1841 }
1842 } else {
1843 if (checkCaseBounds) {
1844 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 2, true>, cudaFuncCachePreferL1);
1845 kBedOfNails<4, 32, 1, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1846 imgSize, numChannels, numImages, startX, strideX, outputsX,
1847 reverse, scaleTargets, scaleOutput);
1848 } else {
1849 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 2, false>, cudaFuncCachePreferL1);
1850 kBedOfNails<4, 32, 1, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1851 imgSize, numChannels, numImages, startX, strideX, outputsX,
1852 reverse, scaleTargets, scaleOutput);
1853 }
1854 }
1855 }
1856 }
1857
1858 void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX,
1859 int strideX, float scaleTargets, float scaleOutput) {
1860 _convBedOfNails(images, target, numChannels, imgSize, startX, strideX, false, scaleTargets, scaleOutput);
1861 }
1862
1863 void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize,
1864 int startX, int strideX, float scaleTargets, float scaleOutput) {
1865
1866 _convBedOfNails(target, actsGrad, numChannels, imgSize, startX, strideX, true, scaleTargets, scaleOutput);
1867 }
1868
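/*
 * Usage sketch for the pair above, under the shape conventions documented in this file
 * (illustrative only; outGrads would come from the layer above):
 */
static void bedOfNailsExample(NVMatrix& imgs, NVMatrix& out, NVMatrix& outGrads, NVMatrix& inGrads,
                              int numChannels, int imgSize) {
    // Forward: keep every 2nd pixel starting at (0,0); out gets DIVUP(imgSize,2)^2 pixels per channel.
    convBedOfNails(imgs, out, numChannels, imgSize, 0, 2, 0, 1);
    // Backward: scatter gradients back to the retained pixel locations (all others become zero).
    convBedOfNailsUndo(outGrads, inGrads, numChannels, imgSize, 0, 2, 0, 1);
}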
1869
1870 /*
1871 * imgs: (numChannels, imgPixels, numImages) with given imgStride
1872 * filter: (1, 2*radius + 1)
1873 * target: (numChannels, imgPixels, numImages)
1874 */
1875 void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels,
1876 float scaleTargets, float scaleOutputs) {
1877 int numImages = images.getNumCols();
1878 int radius = filter.getNumCols() / 2;
1879 int imgPixels = images.getNumRows() / numChannels;
1880 int imgSize = int(sqrt((double)imgPixels));
1881
1882 assert(imgPixels == imgSize * imgSize);
1883 assert(radius >= 1 && radius <= 4);
1884 assert(imgSize >= 2 * radius + 1);
1885 assert(filter.getNumRows() == 1);
1886 assert(images.getNumRows() == numChannels * imgPixels);
1887 assert(!images.isTrans());
1888 assert(!filter.isTrans());
1889 assert(!target.isTrans());
1890 assert(target.isContiguous());
1891 if (scaleTargets == 0) {
1892 target.resize(images);
1893 } else {
1894 assert(target.isSameDims(images));
1895 }
1896
1897 dim3 threads(32, 4);
1898 dim3 blocks(DIVUP(numImages, threads.x), DIVUP(numChannels*imgSize, threads.y));
1899
1900 if (radius == 1) {
1901 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 1>, cudaFuncCachePreferL1);
1902 kGaussianBlur<4, 32, 1><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1903 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1904
1905 } else if (radius == 2) {
1906 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 2>, cudaFuncCachePreferL1);
1907 kGaussianBlur<4, 32, 2><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1908 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1909
1910 } else if (radius == 3) {
1911 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 3>, cudaFuncCachePreferL1);
1912 kGaussianBlur<4, 32, 3><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1913 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1914 } else if (radius == 4) {
1915 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 4>, cudaFuncCachePreferL1);
1916 kGaussianBlur<4, 32, 4><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1917 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1918 }
1919 }
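/*
 * Usage sketch: a full 2-D blur is two separable passes with a normalized 1-D Gaussian
 * of shape (1, 2*radius + 1). Illustrative only; filter construction is up to the caller.
 */
static void gaussianBlur2D(NVMatrix& images, NVMatrix& filter, NVMatrix& tmp, NVMatrix& target,
                           int numChannels) {
    convGaussianBlur(images, filter, tmp, true, numChannels, 0, 1);   // horizontal pass
    convGaussianBlur(tmp, filter, target, false, numChannels, 0, 1);  // vertical pass
}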
1920
1921 /*
1922 * Block size 1x128
1923 * blockIdx.x determines pixel.x, image idx in batches of 128*imgsPerThread
1924 * blockIdx.y determines pixel.y
1925 *
1926 * So each block does one output for some number of images and all the filters.
1927 *
1928 * threadIdx.x determines img idx
1929 *
1930 * imgs: (numFilters, imgPixels, numImages)
1931 * meanDiffs: (numFilters, imgPixels, numImages)
1932 * denoms: (numFilters, imgPixels, numImages) (out)
1933 * target: (numFilters, imgPixels, numImages) (out)
1934 *
1935 * numImages must be divisible by 128*imgsPerThread if checkCaseBounds is false
1936 * numFilters is a template parameter here; each block covers all of them
1937 */
1938
1939 template<int imgsPerThread, int numFilters, bool checkCaseBounds>
1940 __global__ void kCNorm_fewfilter(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
1941 const int numImages, const int sizeX, const float addScale, const float powScale) {
1942
1943 const int imgPixels = imgSize * imgSize;
1944 const int numImgBlocks = DIVUP(numImages, 128*imgsPerThread);
1945 const int pxIdxX = blockIdx.x / numImgBlocks;
1946 const int pxIdxY = blockIdx.y;
1947 const int blockImgIdx = (blockIdx.x % numImgBlocks) * 128 * imgsPerThread;
1948
1949 const int pxIdx = pxIdxY * imgSize + pxIdxX;
1950
1951 const int startPxX = -sizeX/2 + pxIdxX;
1952 const int startPxY = -sizeX/2 + pxIdxY;
1953 const int imgIdx = blockImgIdx + threadIdx.x;
1954
1955 imgs += pxIdx * numImages + imgIdx;
1956 denoms += pxIdx * numImages + imgIdx;
1957 meanDiffs += imgIdx;
1958 target += pxIdx * numImages + imgIdx;
1959
1960 float prod[numFilters][imgsPerThread];
1961 #pragma unroll
1962 for (int i = 0; i < imgsPerThread; i++) {
1963 if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
1964 #pragma unroll
1965 for (int f = 0; f < numFilters; f++) {
1966 prod[f][i] = 0;
1967 }
1968 }
1969 }
1970 const int loopStartY = MAX(0, startPxY);
1971 const int loopStartX = MAX(0, startPxX);
1972 const int loopEndY = MIN(imgSize, startPxY + sizeX);
1973 const int loopEndX = MIN(imgSize, startPxX + sizeX);
1974
1975 for (int y = loopStartY; y < loopEndY; y++) {
1976 for (int x = loopStartX; x < loopEndX; x++) {
1977 const int imgPx = y * imgSize + x;
1978 #pragma unroll
1979 for (int i = 0; i < imgsPerThread; i++) {
1980 if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
1981 #pragma unroll
1982 for (int f = 0; f < numFilters; f++) {
1983 prod[f][i] += square(meanDiffs[(f * imgPixels + imgPx) * numImages + i * 128]);
1984 }
1985 }
1986 }
1987 }
1988 }
1989
1990 #pragma unroll
1991 for (int i = 0; i < imgsPerThread; i++) {
1992 if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
1993 #pragma unroll
1994 for (int f = 0; f < numFilters; f++) {
1995 prod[f][i] = 1 + addScale * prod[f][i];
1996 denoms[f * imgPixels * numImages + i * 128] = prod[f][i];
1997 target[f * imgPixels * numImages + i * 128] = imgs[f * imgPixels * numImages + i * 128] * __powf(prod[f][i], -powScale);
1998 }
1999 }
2000 }
2001 }
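/*
 * In scalar form, the kCNorm family computes, for each filter f and pixel p,
 *
 *     denom(f,p)  = 1 + addScale * sum_{q in sizeX x sizeX window around p} meanDiffs(f,q)^2
 *     target(f,p) = imgs(f,p) * denom(f,p)^(-powScale)
 *
 * i.e. standard local contrast/response normalization within a feature map; the variants
 * (kCNorm_fewfilter, kCNorm_manyfilter, kCNorm2) differ only in how the work is split
 * across blocks and threads.
 */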
2002
2003 /*
2004 * Block size B_YxB_X
2005 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2006 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2007 *
2008 * So each block does one pixel for some number of images/filters.
2009 *
2010 * threadIdx.x determines img idx
2011 * threadIdx.y determines filter idx
2012 *
2013 * imgs: (numFilters, imgPixels, numImages)
2014 * means: (numFilters, imgPixels, numImages)
2015 * denoms: (numFilters, imgPixels, numImages) (out)
2016 * target: (numFilters, imgPixels, numImages) (out)
2017 *
2018 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2019 * numFilters must be divisible by B_Y*filtersPerThread
2020 */
2021 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2022 __global__ void kCNorm_manyfilter(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
2023 const int numFilters, const int numImages, const int sizeX,
2024 const float addScale, const float powScale) {
2025 const int imgPixels = imgSize * imgSize;
2026 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2027 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2028 const int pxIdxX = blockIdx.x / numImgBlocks;
2029 const int pxIdxY = blockIdx.y / numFilterBlocks;
2030 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2031 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2032
2033 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2034
2035 const int startPxX = -sizeX/2 + pxIdxX;
2036 const int startPxY = -sizeX/2 + pxIdxY;
2037 const int imgIdx = blockImgIdx + threadIdx.x;
2038
2039 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2040 meanDiffs += (blockFilterIdx + threadIdx.y) * imgPixels * numImages + imgIdx;
2041 denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2042 target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2043
2044 float prod[filtersPerThread][imgsPerThread];
2045 #pragma unroll
2046 for (int i = 0; i < imgsPerThread; i++) {
2047 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2048 #pragma unroll
2049 for (int f = 0; f < filtersPerThread; f++) {
2050 prod[f][i] = 0;
2051 }
2052 }
2053 }
2054
2055 const int loopStartY = MAX(0, startPxY);
2056 const int loopStartX = MAX(0, startPxX);
2057 const int loopEndY = MIN(imgSize, startPxY + sizeX);
2058 const int loopEndX = MIN(imgSize, startPxX + sizeX);
2059
2060 for (int y = loopStartY; y < loopEndY; y++) {
2061 for (int x = loopStartX; x < loopEndX; x++) {
2062 const int imgPx = y * imgSize + x;
2063 #pragma unroll
2064 for (int i = 0; i < imgsPerThread; i++) {
2065 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2066 #pragma unroll
2067 for (int f = 0; f < filtersPerThread; f++) {
2068 prod[f][i] += square(meanDiffs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]);
2069 }
2070 }
2071 }
2072 }
2073 }
2074
2075 #pragma unroll
2076 for (int i = 0; i < imgsPerThread; i++) {
2077 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2078 #pragma unroll
2079 for (int f = 0; f < filtersPerThread; f++) {
2080 prod[f][i] = 1 + addScale * prod[f][i];
2081 denoms[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2082 target[f * B_Y * imgPixels * numImages + i * B_X] = imgs[f * B_Y * imgPixels * numImages + i * B_X] * __powf(prod[f][i], -powScale);
2083 }
2084 }
2085 }
2086 }
2087
2088
2089 /*
2090 * Block size 16xB_X
2091 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
2092 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
2093 *
2094 * So each block does 4x4 region of pixels for some number of images/filters.
2095 *
2096 * threadIdx.x determines img idx
2097 * threadIdx.y determines pixel idx
2098 *
2099 * imgs: (numFilters, imgPixels, numImages)
2100 * means: (numFilters, imgPixels, numImages)
2101 * denoms: (numFilters, imgPixels, numImages) (out)
2102 * target: (numFilters, imgPixels, numImages) (out)
2103 *
2104 * B_X one of 8, 16, 32
2105 * imgsPerThread one of 1, 2, 4, 8, 16
2106 *
2107 * B_X*imgsPerThread MUST be divisible by 32.
2108 * Number of filters MUST be divisible by filtersPerThread.
2109 *
2110 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2111 * numFilters must be divisible by filtersPerThread
2112 *
2113 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
2114 * reading than writing here, and the reading is all coalesced, so it should be OK.
2115 */
2116 template<int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2117 __global__ void kCNorm2(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
2118 const int numFilters, const int numImages, const int sizeX, const float addScale, const float powScale) {
2119 __shared__ float shDiffs[filtersPerThread][B_X*imgsPerThread];
2120 const int imgPixels = imgSize * imgSize;
2121 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2122 const int numFilterBlocks = numFilters/(filtersPerThread);
2123 const int blockPxX = 4*(blockIdx.x / numImgBlocks);
2124 const int blockPxY = 4*(blockIdx.y / numFilterBlocks);
2125 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2126 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
2127
2128 const int tidx = threadIdx.y * B_X + threadIdx.x;
2129 const int loadY = tidx / 32, loadX = tidx % 32;
2130
2131 const int startPxX = MAX(0, -sizeX/2 + blockPxX);
2132 const int startPxY = MAX(0, -sizeX/2 + blockPxY);
2133 const int endPxX = MIN(imgSize, blockPxX + DIVUP(sizeX, 2) + 3);
2134 const int endPxY = MIN(imgSize, blockPxY + DIVUP(sizeX, 2) + 3);
2135
2136 const int myPxX = blockPxX + threadIdx.y % 4;
2137 const int myPxY = blockPxY + threadIdx.y / 4;
2138 const int myPxIdx = myPxY * imgSize + myPxX;
2139 // const bool doWork = myPxX < imgSize && myPxY < imgSize;
2140 const int myStartPxY = -sizeX/2 + myPxY;
2141 const int myStartPxX = -sizeX/2 + myPxX;
2142 const int myEndPxY = myPxY + DIVUP(sizeX, 2);
2143 const int myEndPxX = myPxX + DIVUP(sizeX, 2);
2144
2145 const int imgIdx = blockImgIdx + threadIdx.x;
2146
2147 imgs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
2148 meanDiffs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
2149 denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
2150 target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
2151
2152 float prod[filtersPerThread][imgsPerThread];
2153 #pragma unroll
2154 for (int i = 0; i < imgsPerThread; i++) {
2155 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2156 #pragma unroll
2157 for (int f = 0; f < filtersPerThread; f++) {
2158 prod[f][i] = 0;
2159 }
2160 }
2161 }
2162
2163 for (int y = startPxY; y < endPxY; y++) {
2164 const bool isInY = y >= myStartPxY && y < myEndPxY;
2165 for (int x = startPxX; x < endPxX; x++) {
2166 const int px = y * imgSize + x;
2167 // All the threads load a pixel from memory
2168 #pragma unroll
2169 for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
2170 if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
2171 #pragma unroll
2172 for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
2173 if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
2174 shDiffs[ly + loadY][lx + loadX] = meanDiffs[(ly * imgPixels + px) * numImages + lx];
2175 }
2176 }
2177 }
2178 }
2179 __syncthreads();
2180
2181 // Each row of threads decides if it's interested in this pixel
2182 if (isInY && x >= myStartPxX && x < myEndPxX) {
2183 #pragma unroll
2184 for (int i = 0; i < imgsPerThread; i++) {
2185 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2186 #pragma unroll
2187 for (int f = 0; f < filtersPerThread; f++) {
2188 prod[f][i] += square(shDiffs[f][threadIdx.x + i * B_X]);
2189 }
2190 }
2191 }
2192 }
2193 __syncthreads();
2194 }
2195 }
2196 // imgs -= (loadY * imgPixels - myPxIdx) * numImages + loadX;
2197 // imgs += threadIdx.x;
2198 if (myPxX < imgSize && myPxY < imgSize) {
2199 #pragma unroll
2200 for (int i = 0; i < imgsPerThread; i++) {
2201 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2202 #pragma unroll
2203 for (int f = 0; f < filtersPerThread; f++) {
2204 prod[f][i] = 1 + addScale * prod[f][i];
2205 denoms[f * imgPixels * numImages + i * B_X] = prod[f][i];
2206 target[f * imgPixels * numImages + i * B_X] = imgs[f * imgPixels * numImages + i * B_X] * __powf(prod[f][i], -powScale);
2207 }
2208 }
2209 }
2210 }
2211 }
2212
2213 /*
2214 * Block size B_YxB_X
2215 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2216 * blockIdx.y determines pixel.y, filter idx in batches of B_Y
2217 *
2218 * So each block does one pixel for some number of images/filters.
2219 *
2220 * threadIdx.x determines img idx
2221 * threadIdx.y determines filter idx
2222 *
2223 * imgs: (numFilters, imgPixels, numImages)
2224 * meanDiffs: (numFilters, imgPixels, numImages)
2225 * denoms: (numFilters, imgPixels, numImages) (out)
2226 * target: (numFilters, imgPixels, numImages) (out)
2227 *
2228 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2229 * numFilters must be divisible by B_Y
2230 */
2231 template<int B_Y, int B_X, int imgsPerThread, bool checkCaseBounds, bool blocked>
2232 __global__ void kFCNorm(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
2233 const int numFilters, const int numImages, const int sizeF,
2234 const float addScale, const float powScale) {
2235 const int imgPixels = imgSize * imgSize;
2236 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2237 const int numFilterBlocks = numFilters/B_Y;
2238 const int pxIdxX = blockIdx.x / numImgBlocks;
2239 const int pxIdxY = blockIdx.y / numFilterBlocks;
2240 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2241 const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;
2242
2243 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2244
2245
2246 const int imgIdx = blockImgIdx + threadIdx.x;
2247
2248 imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2249 meanDiffs += pxIdx * numImages + imgIdx;
2250 denoms += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2251 target += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2252
2253 float prod[imgsPerThread];
2254 #pragma unroll
2255 for (int i = 0; i < imgsPerThread; i++) {
2256 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2257 prod[i] = 0;
2258 }
2259 }
2260
2261 const int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF/2 + filterIdx;
2262 const int loopStartF = blocked ? startF : MAX(0, startF);
2263 const int loopEndF = MIN(numFilters, startF + sizeF);
2264
2265 for (int f = loopStartF; f < loopEndF; ++f) {
2266 #pragma unroll
2267 for (int i = 0; i < imgsPerThread; i++) {
2268 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2269 prod[i] += square(meanDiffs[f * imgPixels * numImages + i * B_X]);
2270 }
2271 }
2272 }
2273
2274 #pragma unroll
2275 for (int i = 0; i < imgsPerThread; i++) {
2276 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2277 prod[i] = 1 + addScale * prod[i];
2278 denoms[i * B_X] = prod[i];
2279 target[i * B_X] = imgs[i * B_X] * __powf(prod[i], -powScale);
2280 }
2281 }
2282 }
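/*
 * Worked example of the cross-map window above, with sizeF = 4: in sliding mode
 * (blocked = false), filterIdx = 5 gives startF = -2 + 5 = 3, so the window covers
 * filters [3, 7); in blocked mode, filterIdx = 5 gives startF = (5/4)*4 = 4, so every
 * filter in [4, 8) shares the window [4, 8). Both are clipped to [0, numFilters).
 */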
2283
2284 /*
2285 * Block size B_YxB_X
2286 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2287 * blockIdx.y determines pixel.y, filter idx in batches of B_Y
2288 *
2289 * So each block does one output pixel for some number of images/filters.
2290 *
2291 * threadIdx.x determines img idx
2292 * threadIdx.y determines filter idx
2293 *
2294 * outGrads: (numFilters, imgPixels, numImages)
2295 * denoms: (numFilters, imgPixels, numImages)
2296 * inputs: (numFilters, imgPixels, numImages)
2297 * acts: (numFilters, imgPixels, numImages)
2298 * target: (numFilters, imgPixels, numImages)
2299 *
2300 * numImages must be divisible by B_X*imgsPerThread
2301 * numFilters must be divisible by B_Y
2302 *
2303 * TODO: this isn't really ideal
2304 */
2305 template<int B_Y, int B_X, int imgsPerThread, bool add, bool checkCaseBounds, bool blocked>
2306 __global__ void kFRNormUndo(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters,
2307 const int numImages, const int sizeF, const float powScale, const float scaleTargets, const float scaleOutputs) {
2308 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2309 const int numFilterBlocks = numFilters/B_Y;
2310
2311 const int pxIdxX = blockIdx.x / numImgBlocks;
2312 const int pxIdxY = blockIdx.y / numFilterBlocks;
2313 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2314 const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;
2315
2316 const int imgPixels = imgSize * imgSize;
2317 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2318 const int imgIdx = blockImgIdx + threadIdx.x;
2319
2320 acts += pxIdx * numImages + imgIdx;
2321 inputs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2322 denoms += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2323 outGrads += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2324 target += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2325
2326 float prod[imgsPerThread];
2327 // if (imgIdx != 0 || pxIdx != 0 || filterIdx != 0) {
2328 // return;
2329 // }
2330 #pragma unroll
2331 for (int i = 0; i < imgsPerThread; i++) {
2332 prod[i] = 0;
2333 }
2334
2335 const int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF + sizeF/2 + 1 + filterIdx;
2336 const int loopStartF = blocked ? startF : MAX(0, startF);
2337 const int loopEndF = MIN(numFilters, startF + sizeF);
2338
2339 for (int f = loopStartF; f < loopEndF; ++f) {
2340 #pragma unroll
2341 for (int i = 0; i < imgsPerThread; i++) {
2342 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2343 prod[i] += acts[f * imgPixels * numImages + i * B_X];
2344 }
2345 }
2346 }
2347 // printf("gpu f start: %d, end: %d\n", loopStartF, loopEndF);
2348
2349 if (!add) {
2350 #pragma unroll
2351 for (int i = 0; i < imgsPerThread; i++) {
2352 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2353 const float inp = inputs[i * B_X];
2354 const float out = outGrads[i * B_X];
2355 const float den = denoms[i * B_X];
2356 prod[i] = inp * prod[i] + out * __powf(den, -powScale);
2357 target[i * B_X] = prod[i];
2358 }
2359 }
2360 } else {
2361 #pragma unroll
2362 for (int i = 0; i < imgsPerThread; i++) {
2363 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2364 const float inp = inputs[i * B_X];
2365 const float out = outGrads[i * B_X];
2366 const float den = denoms[i * B_X];
2367 prod[i] = inp * prod[i] + out * __powf(den, -powScale);
2368 target[i * B_X] = scaleTargets * target[i * B_X] + scaleOutputs * prod[i];
2369 }
2370 }
2371 }
2372 }
2373
2374 /*
2375 * Block size B_YxB_X
2376 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2377 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2378 *
2379 * So each block does one pixel for some number of images/filters.
2380 *
2381 * threadIdx.x determines img idx
2382 * threadIdx.y determines filter idx
2383 *
2384 * imgs: (numFilters, imgPixels, numImages)
2385 * target: (numFilters, imgPixels, numImages)
2386 *
2387 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2388 * numFilters must be divisible by B_Y*filtersPerThread
2389 *
2390 * sizeX should be something like 3 or 5 for this function. Not much more.
2391 * TODO: write variant where each block does 4x4 region or so (this'll be based on kCNorm2).
2392 */
2393 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2394 __global__ void kTICA_manyfilter(float* imgs, float* target, const int imgSize,
2395 const int numFilters, const int numImages, const int sizeX,
2396 const float scaleTarget, const float scaleOutput) {
2397 const int imgPixels = imgSize * imgSize;
2398 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2399 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2400 const int pxIdxX = blockIdx.x / numImgBlocks;
2401 const int pxIdxY = blockIdx.y / numFilterBlocks;
2402 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2403 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2404
2405 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2406
2407 const int startPxX = -sizeX/2 + pxIdxX;
2408 const int startPxY = -sizeX/2 + pxIdxY;
2409 const int imgIdx = blockImgIdx + threadIdx.x;
2410
2411 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
2412 target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2413
2414 float prod[filtersPerThread][imgsPerThread];
2415 #pragma unroll
2416 for (int i = 0; i < imgsPerThread; i++) {
2417 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2418 #pragma unroll
2419 for (int f = 0; f < filtersPerThread; f++) {
2420 prod[f][i] = 0;
2421 }
2422 }
2423 }
2424 const int loopStartY = MAX(0, startPxY);
2425 const int loopStartX = MAX(0, startPxX);
2426 const int loopEndY = MIN(imgSize, startPxY + sizeX);
2427 const int loopEndX = MIN(imgSize, startPxX + sizeX);
2428
2429 for (int y = loopStartY; y < loopEndY; y++) {
2430 for (int x = loopStartX; x < loopEndX; x++) {
2431
2432 const int imgPx = y * imgSize + x;
2433 #pragma unroll
2434 for (int i = 0; i < imgsPerThread; i++) {
2435
2436 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2437 #pragma unroll
2438 for (int f = 0; f < filtersPerThread; f++) {
2439 prod[f][i] += square(imgs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]);
2440 }
2441 }
2442 }
2443 }
2444 }
2445 imgs += pxIdx * numImages;
2446 if (scaleTarget == 0) {
2447 #pragma unroll
2448 for (int i = 0; i < imgsPerThread; i++) {
2449 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2450 #pragma unroll
2451 for (int f = 0; f < filtersPerThread; f++) {
2452 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleOutput * __fdividef(1.0f, 0.001f + sqrtf(prod[f][i]));
2453 }
2454 }
2455 }
2456 } else {
2457 #pragma unroll
2458 for (int i = 0; i < imgsPerThread; i++) {
2459 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2460 #pragma unroll
2461 for (int f = 0; f < filtersPerThread; f++) {
2462 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTarget * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutput * __fdividef(1.0f, 0.001f + sqrtf(prod[f][i]));
2463 }
2464 }
2465 }
2466 }
2467 }
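/*
 * In scalar form, kTICA_manyfilter writes, per filter f and pixel p,
 *
 *     target(f,p) = scaleOutput / (0.001 + sqrt(sum of imgs^2 over the sizeX x sizeX window))
 *
 * i.e. the reciprocal pooled norms ("1/S" values) that kTICAGrad_manyfilter below
 * re-pools to form the TICA gradient -imgs * sum(1/S).
 */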
2468
2469 /*
2470 * Block size B_YxB_X
2471 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2472 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2473 *
2474 * So each block does one pixel for some number of images/filters.
2475 *
2476 * threadIdx.x determines img idx
2477 * threadIdx.y determines filter idx
2478 *
2479 * imgs: (numFilters, imgPixels, numImages)
2480 * ticas: (numFilters, imgPixels, numImages)
2481 * target: (numFilters, imgPixels, numImages)
2482 *
2483 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2484 * numFilters must be divisible by B_Y*filtersPerThread
2485 *
2486 * sizeX should be something like 3 or 5 for this function. Not much more.
2487 * TODO: write variant where each block does 4x4 region or so (this'll be based on kCNorm2).
2488 */
2489 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2490 __global__ void kTICAGrad_manyfilter(float* imgs, float* ticas, float* target, const int imgSize,
2491 const int numFilters, const int numImages, const int sizeX,
2492 const float scaleTarget, const float scaleOutput) {
2493 const int imgPixels = imgSize * imgSize;
2494 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2495 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2496 const int pxIdxX = blockIdx.x / numImgBlocks;
2497 const int pxIdxY = blockIdx.y / numFilterBlocks;
2498 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2499 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2500
2501 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2502
2503 const int startPxX = -sizeX/2 + pxIdxX;
2504 const int startPxY = -sizeX/2 + pxIdxY;
2505 const int imgIdx = blockImgIdx + threadIdx.x;
2506
2507 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2508 ticas += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
2509 target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2510
2511 float prod[filtersPerThread][imgsPerThread];
2512 #pragma unroll
2513 for (int i = 0; i < imgsPerThread; i++) {
2514 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2515 #pragma unroll
2516 for (int f = 0; f < filtersPerThread; f++) {
2517 prod[f][i] = 0;
2518 }
2519 }
2520 }
2521 const int loopStartY = MAX(0, startPxY);
2522 const int loopStartX = MAX(0, startPxX);
2523 const int loopEndY = MIN(imgSize, startPxY + sizeX);
2524 const int loopEndX = MIN(imgSize, startPxX + sizeX);
2525
2526 for (int y = loopStartY; y < loopEndY; y++) {
2527 for (int x = loopStartX; x < loopEndX; x++) {
2528
2529 const int imgPx = y * imgSize + x;
2530 #pragma unroll
2531 for (int i = 0; i < imgsPerThread; i++) {
2532
2533 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2534 #pragma unroll
2535 for (int f = 0; f < filtersPerThread; f++) {
2536 // adding 1/S values
2537 prod[f][i] += ticas[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X];
2538 }
2539 }
2540 }
2541 }
2542 }
2543 if (scaleTarget == 0) {
2544 #pragma unroll
2545 for (int i = 0; i < imgsPerThread; i++) {
2546 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2547 #pragma unroll
2548 for (int f = 0; f < filtersPerThread; f++) {
2549 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleOutput * -imgs[f * B_Y * imgPixels * numImages + i * B_X] * prod[f][i];
2550 }
2551 }
2552 }
2553 } else {
2554 #pragma unroll
2555 for (int i = 0; i < imgsPerThread; i++) {
2556 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2557 #pragma unroll
2558 for (int f = 0; f < filtersPerThread; f++) {
2559 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTarget * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutput * -imgs[f * B_Y * imgPixels * numImages + i * B_X] * prod[f][i]; // prod already holds pooled 1/S values; match the scaleTarget == 0 branch
2560 }
2561 }
2562 }
2563 }
2564 }
2565
2566 /*
2567 * Block size B_YxB_X
2568 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2569 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2570 *
2571 * So each block does one output pixel for some number of images/filters.
2572 *
2573 * threadIdx.x determines img idx
2574 * threadIdx.y determines filter idx
2575 *
2576 * avgGrads: (numFilters, numOutputs, numImages)
2577 * target: (numFilters, imgPixels, numImages)
2578 *
2579 * Each input pixel accumulates gradient from every pooling region that covers it.
2580 *
2581 * numImages must be divisible by B_X*imgsPerThread
2582 * numFilters must be divisible by B_Y*filtersPerThread
2583 */
2584
2585 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
2586 __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize, const int numFilters,
2587 const int numImages, const int subsX, const int startX, const int strideX, const int outputsX,
2588 const float scaleTargets, const float scaleOutputs) {
2589 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2590 const int blockPxX = blockIdx.x / numImgBlocks;
2591 const int blockPxY = blockIdx.y / (numFilters/(B_Y*filtersPerThread));
2592
2593 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2594 const int blockFilterIdx = (blockIdx.y % (numFilters/(B_Y*filtersPerThread))) * B_Y * filtersPerThread;
2595
2596 const int blockPx = blockPxY * imgSize + blockPxX;
2597 const int numOutputs = outputsX * outputsX;
2598 const int imgPixels = imgSize * imgSize;
2599
2600 const int startOutputY = blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX;
2601 const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX);
2602 const int startOutputX = blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX;
2603 const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
2604
2605 const int imgIdx = blockImgIdx + threadIdx.x;
2606
2607 avgGrads += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
2608 target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2609
2610 float prod[filtersPerThread][imgsPerThread];
2611 #pragma unroll
2612 for (int f = 0; f < filtersPerThread; f++) {
2613 #pragma unroll
2614 for (int i = 0; i < imgsPerThread; i++) {
2615 prod[f][i] = 0;
2616 }
2617 }
2618
2619 if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX
2620 && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) {
2621
2622 for (int my = startOutputY; my < endOutputY; my++) {
2623 const float regionStartY = fmaxf(0, startX + my * strideX);
2624 const float regionEndY = fminf(imgSize, startX + my * strideX + subsX);
2625 const float regionSizeY = regionEndY - regionStartY;
2626 for (int mx = startOutputX; mx < endOutputX; mx++) {
2627 const int outputIdx = my * outputsX + mx;
2628 const float regionStartX = fmaxf(0, startX + mx * strideX);
2629 const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX);
2630 const float regionSizeX = regionEndX - regionStartX;
2631 // It's important to do the division here, because pushing division into the below
2632 // loops makes the code 4x slower.
2633 const float regionSizeInv = 1.0f / (regionSizeX * regionSizeY);
2634 #pragma unroll
2635 for (int i = 0; i < imgsPerThread; i++) {
2636 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2637 #pragma unroll
2638 for (int f = 0; f < filtersPerThread; f++) {
2639 prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X] * regionSizeInv;
2640 }
2641 }
2642 }
2643 }
2644 }
2645 }
2646
2647 if (!add) {
2648 #pragma unroll
2649 for (int i = 0; i < imgsPerThread; i++) {
2650 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2651 #pragma unroll
2652 for (int f = 0; f < filtersPerThread; f++) {
2653 target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2654 }
2655 }
2656 }
2657 } else {
2658 #pragma unroll
2659 for (int i = 0; i < imgsPerThread; i++) {
2660 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2661 #pragma unroll
2662 for (int f = 0; f < filtersPerThread; f++) {
2663 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
2664 }
2665 }
2666 }
2667 }
2668 }
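/*
 * In scalar form: average pooling computes out(o) = sum_{p in R_o} in(p) / |R_o|, so its
 * backward pass gives each input pixel p the sum of avgGrads(o) / |R_o| over every pooling
 * region R_o containing p -- exactly the accumulation above, with |R_o| = regionSizeX *
 * regionSizeY shrunk at the image border.
 */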
2669
2670 /*
2671 * Block size B_YxB_X
2672 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2673 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2674 *
2675 * So each block does one output pixel for some number of images/filters.
2676 *
2677 * threadIdx.x determines img idx
2678 * threadIdx.y determines filter idx
2679 *
2680 * imgs: (numFilters, imgPixels, numImages)
2681 * maxGrads: (numFilters, numOutputs, numImages)
2682 * maxActs: (numFilters, numOutputs, numImages)
2683 * target: (numFilters, imgPixels, numImages)
2684 *
2685 * numImages must be divisible by B_X*imgsPerThread
2686 * numFilters must be divisible by B_Y*filtersPerThread
2687 */
2688
2689 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
2690 __global__ void kLocalMaxUndo(float* imgs, float* maxGrads, float* maxActs, float* target, const int imgSize, const int numFilters,
2691 const int numImages, const int subsX, const int startX, const int strideX, const int outputsX,
2692 const float scaleTargets, const float scaleOutputs) {
2693 __shared__ float shImgs[B_Y*filtersPerThread][B_X*imgsPerThread];
2694 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2695 const int blockPxX = blockIdx.x / numImgBlocks;
2696 const int blockPxY = blockIdx.y / (numFilters/(B_Y*filtersPerThread));
2697
2698 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2699 const int blockFilterIdx = (blockIdx.y % (numFilters/(B_Y*filtersPerThread))) * B_Y * filtersPerThread;
2700
2701 const int blockPx = blockPxY * imgSize + blockPxX;
2702 const int numOutputs = outputsX * outputsX;
2703 const int imgPixels = imgSize * imgSize;
2704
2705 const int startOutputY = blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX;
2706 const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX);
2707 const int startOutputX = blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX;
2708 const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
2709
2710 const int imgIdx = blockImgIdx + threadIdx.x;
2711
2712 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2713 maxGrads += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages
2714 + imgIdx;
2715 maxActs += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages
2716 + imgIdx;
2717
2718 target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2719
2720 float prod[filtersPerThread][imgsPerThread];
2721 #pragma unroll
2722 for (int f = 0; f < filtersPerThread; f++) {
2723 #pragma unroll
2724 for (int i = 0; i < imgsPerThread; i++) {
2725 prod[f][i] = 0;
2726 }
2727 }
2728
2729 if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX
2730 && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) {
2731 #pragma unroll
2732 for (int i = 0; i < imgsPerThread; i++) {
2733 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2734 #pragma unroll
2735 for (int f = 0; f < filtersPerThread; f++) {
2736 shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i] = imgs[f * B_Y * imgPixels * numImages + i * B_X];
2737 }
2738 }
2739 }
2740 for (int my = startOutputY; my < endOutputY; my++) {
2741 for (int mx = startOutputX; mx < endOutputX; mx++) {
2742 const int outputIdx = my * outputsX + mx;
2743 #pragma unroll
2744 for (int i = 0; i < imgsPerThread; i++) {
2745 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2746 #pragma unroll
2747 for (int f = 0; f < filtersPerThread; f++) {
2748 const float ma = maxActs[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
2749 const float mg = maxGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
2750 const float img = shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i];
2751
2752 prod[f][i] += (img == ma) * mg;
2753 }
2754 }
2755 }
2756 }
2757 }
2758 }
2759 if (!add) {
2760 #pragma unroll
2761 for (int i = 0; i < imgsPerThread; i++) {
2762 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2763 #pragma unroll
2764 for (int f = 0; f < filtersPerThread; f++) {
2765 target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2766 }
2767 }
2768 }
2769 } else {
2770 #pragma unroll
2771 for (int i = 0; i < imgsPerThread; i++) {
2772 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2773 #pragma unroll
2774 for (int f = 0; f < filtersPerThread; f++) {
2775 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
2776 }
2777 }
2778 }
2779 }
2780 }
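/*
 * The (img == ma) * mg term above routes each region's gradient only to the input
 * pixel(s) whose value equals the pooled maximum: d out(o) / d in(p) is 1 where in(p)
 * achieved the max of region R_o and 0 elsewhere. On the rare tie, every tied pixel
 * receives the full gradient.
 */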
2781
2782
2783
2784
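/*
 * Backward pass of probabilistic max pooling. This kernel has no header comment in the
 * original, so the shapes below are read off its indexing:
 *
 * maxout_h, hGrads, target_z: (numFilters, imgPixels, numImages)
 * maxout_p, pGrads, target_t: (numFilters, numOutputs, numImages)
 *
 * gp_iszero / gh_iszero are single-element device flags marking that pGrads / hGrads
 * is identically zero.
 */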
2785 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2786 __global__ void kLocalProbMaxUndo(float* maxout_h, float* maxout_p, float* hGrads, float* pGrads, float* target_z, float* target_t, const int imgSize, const int numFilters, const int numImages, const int subsX, const int startX, const int strideX, const int outputsX, float * gp_iszero, float * gh_iszero) {
2787
2788 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2789 const int numFilterBlocks = DIVUP(numFilters, B_Y*filtersPerThread);
2790 const int outputIdxX = blockIdx.x / numImgBlocks;
2791 const int outputIdxY = blockIdx.y / numFilterBlocks;
2792 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2793 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2794 const int myFilterIdx = (blockFilterIdx + threadIdx.y*filtersPerThread);
2795 if (myFilterIdx >= numFilters) {
2796 return;
2797 }
2798
2799 const int outputIdx = outputIdxY * outputsX + outputIdxX;
2800 const int numOutputs = outputsX * outputsX;
2801 const int imgPixels = imgSize * imgSize;
2802
2803 const int startImgPxX = startX + outputIdxX * strideX;
2804 const int startImgPxY = startX + outputIdxY * strideX;
2805 const int imgIdx = blockImgIdx + threadIdx.x;
2806
2807 maxout_h += myFilterIdx * imgPixels * numImages + imgIdx;
2808 hGrads += myFilterIdx * imgPixels * numImages + imgIdx;
2809 target_z += myFilterIdx * imgPixels * numImages + imgIdx;
2810 maxout_p += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
2811 pGrads += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
2812 target_t += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
2813
2814 float prod[filtersPerThread][imgsPerThread];
2815 for (int f = 0; f < filtersPerThread; f++) {
2816 for (int i = 0; i < imgsPerThread; i++) {
2817 prod[f][i] = 0;
2818 }
2819 }
2820
2821 const int loopStartY = MAX(0, startImgPxY);
2822 const int loopStartX = MAX(0, startImgPxX);
2823 const int loopEndY = MIN(imgSize, startImgPxY + subsX);
2824 const int loopEndX = MIN(imgSize, startImgPxX + subsX);
2825
2826
2827 for (int y = loopStartY; y < loopEndY; y++) {
2828 for (int x = loopStartX; x < loopEndX; x++) {
2829 const int imgPx = y * imgSize + x;
2830 for (int i = 0; i < imgsPerThread; i++) {
2831 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2832 for (int f = 0; f < filtersPerThread; f++) {
2833 const float ma = maxout_h[(f * imgPixels + imgPx) * numImages + i * B_X];
2834 const float mg = hGrads[(f * imgPixels + imgPx) * numImages + i * B_X];
2835 prod[f][i] += ma * mg;
2836 }
2837 }
2838 }
2839 }
2840 }
2841
2842
2843 for (int i = 0; i < imgsPerThread; i++) {
2844 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2845 for (int f = 0; f < filtersPerThread; f++) {
2846 prod[f][i] -= (1 - maxout_p[f*numOutputs*numImages + i * B_X]) * pGrads[f*numOutputs*numImages + i * B_X];
2847 }
2848 }
2849 }
2850
2851
2852 for (int y = loopStartY; y < loopEndY; y++) {
2853 for (int x = loopStartX; x < loopEndX; x++) {
2854 const int imgPx = y * imgSize + x;
2855 for (int i = 0; i < imgsPerThread; i++) {
2856 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2857 for (int f = 0; f < filtersPerThread; f++) {
2858 const float ma = maxout_h[(f * imgPixels + imgPx) * numImages + i * B_X];
2859 const float mg = hGrads[(f * imgPixels + imgPx) * numImages + i * B_X];
2860 target_z[(f*imgPixels + imgPx) * numImages + i * B_X] = ma * mg - (prod[f][i] * ma);
2861 }
2862 }
2863 }
2864 }
2865 }
2866
2867 // Special-case handling for when one of the incoming gradients is identically zero (admittedly weird):
2868 for (int i = 0; i < imgsPerThread; i++) {
2869 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2870 for (int f = 0; f < filtersPerThread; f++) {
2871 const float ma = maxout_p[f*numOutputs*numImages + i * B_X];
2872 float mg = pGrads[f*numOutputs*numImages + i * B_X];
2873 if (*gh_iszero == 1) {
2874 target_t[f*numOutputs*numImages + i * B_X] = - prod[f][i] * ma;
2875 } else if (*gp_iszero == 1) {
2876 target_t[f*numOutputs*numImages + i * B_X] = ma - prod[f][i] * ma;
2877 } else {
2878 target_t[f*numOutputs*numImages + i * B_X] = ma * mg - prod[f][i] * ma;
2879 }
2880 }
2881 }
2882 }
2883 }
2884
2885
2886 /*
2887 * acts := -2 x scale x acts x outGrads / denoms
2888 */
2889 template<int B_X, int eltsPerThread>
2890 __global__ void kRNormUndoPrelims(float* acts, float* denoms, float* outGrads,
2891 const uint numElements, const float scale) {
2892 const uint e = B_X * blockIdx.x * eltsPerThread + threadIdx.x;
2893 const uint numThreads = B_X * gridDim.x;
2894 for (uint i = e; i < numElements; i += numThreads*eltsPerThread) {
2895 #pragma unroll
2896 for (uint k = 0; k < eltsPerThread; k++) {
2897 if (i + k * B_X < numElements) {
2898 acts[i + k * B_X] = __fdividef(scale*outGrads[i + k * B_X] * acts[i + k * B_X], denoms[i + k * B_X]);
2899 }
2900 }
2901 }
2902 }
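/*
 * kRNormUndoPrelims is stage one of the response-normalization backward pass: it
 * overwrites acts elementwise with scale * acts * outGrads / denoms (per the comment
 * above, the caller is expected to fold the -2 factor into scale). kRNormUndo below
 * then sums those values over each pixel's neighborhood and forms
 * inputs * pooledSum + outGrads * denoms^(-powScale).
 */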
2903
2904 /*
2905 * Block size B_YxB_X
2906 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2907 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2908 *
2909 * So each block does one output pixel for some number of images/filters.
2910 *
2911 * threadIdx.x determines img idx
2912 * threadIdx.y determines filter idx
2913 *
2914 * outGrads: (numFilters, imgPixels, numImages)
2915 * denoms: (numFilters, imgPixels, numImages)
2916 * inputs: (numFilters, imgPixels, numImages)
2917 * acts: (numFilters, imgPixels, numImages)
2918 * target: (numFilters, imgPixels, numImages)
2919 *
2920 * numImages must be divisible by B_X*imgsPerThread
2921 * numFilters must be divisible by B_Y*filtersPerThread
2922 *
2923 * TODO: this isn't really ideal
2924 */
2925 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
2926 __global__ void kRNormUndo(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters,
2927 const int numImages, const int sizeX, const float powScale, const float scaleTargets, const float scaleOutputs) {
2928 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2929 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2930
2931 const int blockPxX = blockIdx.x / numImgBlocks;
2932 const int blockPxY = blockIdx.y / numFilterBlocks;
2933
2934 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2935 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2936
2937 const int blockPx = blockPxY * imgSize + blockPxX;
2938 const int imgPixels = imgSize * imgSize;
2939
2940 const int startY = MAX(0, blockPxY + sizeX/2 - sizeX + 1);
2941 const int startX = MAX(0, blockPxX + sizeX/2 - sizeX + 1);
2942 const int endY = MIN(imgSize, blockPxY + sizeX/2 + 1);
2943 const int endX = MIN(imgSize, blockPxX + sizeX/2 + 1);
2944
2945 const int imgIdx = blockImgIdx + threadIdx.x;
2946
2947 acts += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
2948 inputs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2949 denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2950 outGrads += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2951 target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2952
2953 float prod[filtersPerThread][imgsPerThread];
2954 #pragma unroll
2955 for (int f = 0; f < filtersPerThread; f++) {
2956 #pragma unroll
2957 for (int i = 0; i < imgsPerThread; i++) {
2958 prod[f][i] = 0;
2959 }
2960 }
2961
2962 for (int sy = startY; sy < endY; sy++) {
2963 for (int sx = startX; sx < endX; sx++) {
2964 const int outPx = sy * imgSize + sx;
2965
2966 #pragma unroll
2967 for (int i = 0; i < imgsPerThread; i++) {
2968 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2969 #pragma unroll
2970 for (int f = 0; f < filtersPerThread; f++) {
2971 prod[f][i] += acts[(f * B_Y * imgPixels + outPx) * numImages + i * B_X];
2972 }
2973 }
2974 }
2975 }
2976 }
2977 // outGrads += blockPx * numImages;
2978 if (!add) {
2979 #pragma unroll
2980 for (int i = 0; i < imgsPerThread; i++) {
2981 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2982 #pragma unroll
2983 for (int f = 0; f < filtersPerThread; f++) {
2984 const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X];
2985 const float out = outGrads[(f * B_Y * imgPixels) * numImages + i * B_X];
2986 const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X];
2987 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
2988 target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2989 }
2990 }
2991 }
2992 } else {
2993 #pragma unroll
2994 for (int i = 0; i < imgsPerThread; i++) {
2995 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2996 #pragma unroll
2997 for (int f = 0; f < filtersPerThread; f++) {
2998 const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X];
2999 const float out = outGrads[(f * B_Y * imgPixels) * numImages + i * B_X];
3000 const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X];
3001 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
3002 target[f * B_Y * imgPixels * numImages + i * B_X] =
3003 scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X]
3004 + scaleOutputs * prod[f][i];
3005 }
3006 }
3007 }
3008 }
3009 }
3010
3011
3012 /*
3013 * Block size 16xB_X
3014 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
3015 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
3016 *
3017 * So each block does 4x4 region for some number of images/filters.
3018 *
3019 * threadIdx.x determines img idx
3020 * threadIdx.y determines pixel idx
3021 *
3022 * outGrads: (numFilters, imgPixels, numImages)
3023 * denoms: (numFilters, imgPixels, numImages)
3024 * inputs: (numFilters, imgPixels, numImages)
3025 * acts: (numFilters, imgPixels, numImages)
3026 * target: (numFilters, imgPixels, numImages)
3027 *
3028 * B_X one of 8, 16, 32
3029 * imgsPerThread one of 1, 2, 4, 8, 16
3030 *
3031 * B_X * imgsPerThread MUST be divisible by 32.
3032 * Number of filters MUST be divisible by filtersPerThread.
3033 *
3034 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
3035 * numFilters must be divisible by filtersPerThread
3036 *
3037 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
3038 * reading than writing here, and the reading is all coalesced, so it should be OK.
3039 */
3040 template<int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
3041 __global__ void kRNormUndo2(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters,
3042 const int numImages, const int sizeX, const float powScale, const float scaleTargets, const float scaleOutputs) {
3043 __shared__ float shActs[filtersPerThread][B_X*imgsPerThread];
3044 const int imgPixels = imgSize * imgSize;
3045 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
3046 const int numFilterBlocks = numFilters/(filtersPerThread);
3047 const int blockPxX = 4*(blockIdx.x / numImgBlocks);
3048 const int blockPxY = 4*(blockIdx.y / numFilterBlocks);
3049 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
3050 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
3051
3052 const int tidx = threadIdx.y * B_X + threadIdx.x;
3053 const int loadY = tidx / 32, loadX = tidx % 32;
3054
3055 const int startPxX = MAX(0, -DIVUP(sizeX,2) + blockPxX + 1);
3056 const int startPxY = MAX(0, -DIVUP(sizeX,2) + blockPxY + 1);
3057 const int endPxX = MIN(imgSize, blockPxX + sizeX/2 + 4);
3058 const int endPxY = MIN(imgSize, blockPxY + sizeX/2 + 4);
3059
3060 const int myPxX = blockPxX + threadIdx.y % 4;
3061 const int myPxY = blockPxY + threadIdx.y / 4;
3062 const int myPxIdx = myPxY * imgSize + myPxX;
3063 // const bool doWork = myPxX < imgSize && myPxY < imgSize;
3064 const int myStartPxY = -DIVUP(sizeX,2) + myPxY + 1;
3065 const int myStartPxX = -DIVUP(sizeX,2) + myPxX + 1;
3066 const int myEndPxY = myPxY + sizeX/2 + 1;
3067 const int myEndPxX = myPxX + sizeX/2 + 1;
3068
3069 const int imgIdx = blockImgIdx + threadIdx.x;
3070
3071 acts += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
3072 denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3073 inputs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3074 outGrads += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3075 target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3076
3077 float prod[filtersPerThread][imgsPerThread];
3078 #pragma unroll
3079 for (int f = 0; f < filtersPerThread; f++) {
3080 #pragma unroll
3081 for (int i = 0; i < imgsPerThread; i++) {
3082 prod[f][i] = 0;
3083 }
3084 }
3085
3086 for (int y = startPxY; y < endPxY; y++) {
3087 const bool isInY = y >= myStartPxY && y < myEndPxY;
3088 for (int x = startPxX; x < endPxX; x++) {
3089 const int px = y * imgSize + x;
3090 // All the threads load a pixel from memory
3091 #pragma unroll
3092 for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
3093 if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
3094 #pragma unroll
3095 for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
3096 if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
3097 shActs[ly + loadY][lx + loadX] = acts[(ly * imgPixels + px) * numImages + lx];
3098 }
3099 }
3100 }
3101 }
3102 __syncthreads();
3103
3104 // Each row of threads decides if it's interested in this pixel
3105 if (isInY && x >= myStartPxX && x < myEndPxX) {
3106 #pragma unroll
3107 for (int i = 0; i < imgsPerThread; i++) {
3108 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
3109 #pragma unroll
3110 for (int f = 0; f < filtersPerThread; f++) {
3111 prod[f][i] += shActs[f][threadIdx.x + i * B_X];
3112 }
3113 }
3114 }
3115 }
3116 __syncthreads();
3117 }
3118 }
3119 acts -= (loadY * imgPixels - myPxIdx) * numImages + loadX;
3120 acts += threadIdx.x;
3121 if (myPxX < imgSize && myPxY < imgSize) {
3122 if (!add) {
3123 #pragma unroll
3124 for (int i = 0; i < imgsPerThread; i++) {
3125 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
3126 #pragma unroll
3127 for (int f = 0; f < filtersPerThread; f++) {
3128 const float out = outGrads[f * imgPixels * numImages + i * B_X];
3129 const float den = denoms[f * imgPixels * numImages + i * B_X];
3130 const float inp = inputs[f * imgPixels * numImages + i * B_X];
3131 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
3132 target[f * imgPixels * numImages + i * B_X] = prod[f][i];
3133 }
3134 }
3135 }
3136 } else {
3137 #pragma unroll
3138 for (int i = 0; i < imgsPerThread; i++) {
3139 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
3140 #pragma unroll
3141 for (int f = 0; f < filtersPerThread; f++) {
3142 const float out = outGrads[f * imgPixels * numImages + i * B_X];
3143 const float den = denoms[f * imgPixels * numImages + i * B_X];
3144 const float inp = inputs[f * imgPixels * numImages + i * B_X];
3145 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
3146 target[f * imgPixels * numImages + i * B_X] = scaleTargets * target[f * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
3147 }
3148 }
3149 }
3150 }
3151
3152 }
3153 }
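/*
 * Example of the B_X * imgsPerThread constraint above (a sketch, with values
 * taken from the dispatch in convResponseNormUndo below): B_X = 16 with
 * imgsPerThread = 8 covers 16 * 8 = 128 images per block row, which is
 * divisible by 32; that is the configuration chosen when numImages % 128 == 0.
 */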
3154
3155 void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
3156 int subsX, int startX, int strideX, int outputsX) {
3157 convLocalMaxUndo(images, maxGrads, maxActs, target, subsX, startX, strideX, outputsX, 0, 1);
3158 }
3159
3160 /*
3161 * imgs: (numFilters * imgPixels, numImages)
3162 * maxGrads: (numFilters * numOutputs, numImages)
3163 * maxActs: (numFilters * numOutputs, numImages)
3164 * target: (numFilters * imgPixels, numImages)
3165 */
3166 void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
3167 int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput) {
3168 int outputs = outputsX * outputsX;
3169 int numImages = images.getNumCols();
3170 int numFilters = maxGrads.getNumRows() / outputs;
3171 int imgPixels = images.getNumRows() / numFilters;
3172 assert(images.getNumRows() == numFilters * imgPixels);
3173 int imgSize = int(sqrt((double)imgPixels));
3174
3175 assert(imgSize * imgSize == imgPixels);
3176 assert(maxGrads.getNumRows() == numFilters * outputs);
3177 assert(maxGrads.getNumCols() == numImages);
3178 assert(!images.isTrans());
3179 assert(!target.isTrans());
3180 assert(!maxGrads.isTrans());
3181 assert(!maxActs.isTrans());
3182 assert(images.isContiguous());
3183 assert(maxGrads.isContiguous());
3184 assert(maxActs.isContiguous());
3185 assert(maxGrads.isSameDims(maxActs));
3186 assert(numFilters % 16 == 0);
3187 // assert(numImages % 128 == 0);
3188
3189 assert(strideX <= subsX);
3190
3191 target.resize(images);
3192 assert(target.isContiguous());
3193 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3194 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
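    // Worked example of this dispatch (values are illustrative): numImages = 192
    // gives imgsPerThread = 2 (192 % 64 == 0) with no bounds check, since
    // 192 % (32*2) == 0; numImages = 100 falls back to imgsPerThread = 1 with
    // checkCaseBounds true (100 % 32 != 0).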
3195 dim3 threads(32, 4);
3196 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 2)) * imgSize);
3197
3198 if (imgsPerThread == 4) {
3199 if (checkCaseBounds) {
3200 if (scaleTargets == 0 && scaleOutput == 1) {
3201 kLocalMaxUndo<4, 32, 4, 2, false, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3202 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3203 } else {
3204 kLocalMaxUndo<4, 32, 4, 2, true, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3205 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3206 }
3207 } else {
3208 if (scaleTargets == 0 && scaleOutput == 1) {
3209 kLocalMaxUndo<4, 32, 4, 2, false, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3210 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3211 } else {
3212 kLocalMaxUndo<4, 32, 4, 2, true, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3213 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3214 }
3215 }
3216 } else if (imgsPerThread == 2) {
3217 if (checkCaseBounds) {
3218 if (scaleTargets == 0 && scaleOutput == 1) {
3219 kLocalMaxUndo<4, 32, 2, 2, false, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3220 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3221 } else {
3222 kLocalMaxUndo<4, 32, 2, 2, true, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3223 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3224 }
3225 } else {
3226 if (scaleTargets == 0 && scaleOutput == 1) {
3227 kLocalMaxUndo<4, 32, 2, 2, false, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3228 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3229 } else {
3230 kLocalMaxUndo<4, 32, 2, 2, true, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3231 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3232 }
3233 }
3234 } else {
3235 if (checkCaseBounds) {
3236 if (scaleTargets == 0 && scaleOutput == 1) {
3237 kLocalMaxUndo<4, 32, 1, 2, false, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3238 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3239 } else {
3240 kLocalMaxUndo<4, 32, 1, 2, true, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3241 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3242 }
3243 } else {
3244 if (scaleTargets == 0 && scaleOutput == 1) {
3245 kLocalMaxUndo<4, 32, 1, 2, false, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3246 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3247 } else {
3248 kLocalMaxUndo<4, 32, 1, 2, true, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3249 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3250 }
3251 }
3252 }
3253
3254 cutilCheckMsg("convLocalMaxUndo: kernel execution failed");
3255 }
3256
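/*
 * Usage sketch for the wrapper above (hypothetical values, not from the
 * original source): undoing a non-overlapping 2x2 max pool over 24x24 maps:
 *
 *   convLocalMaxUndo(images, maxGrads, maxActs, target,
 *                    2,   // subsX: pooling window size
 *                    0,   // startX: offset of the first window
 *                    2,   // strideX: window stride (== subsX, so no overlap)
 *                    12); // outputsX: 24 / 2
 *
 * This satisfies the strideX <= subsX assertion checked below.
 */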
3257 void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target, int subsX, int startX, int strideX, int outputsX, int imgSize) {
3258 convLocalAvgUndo(avgGrads, target, subsX, startX, strideX, outputsX, imgSize, 0, 1);
3259 }
3260
3261 /*
3262 * avgGrads: (numFilters, numOutputs, numImages)
3263 * target: (numFilters, imgPixels, numImages)
3264 */
3265 void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
3266 int subsX, int startX, int strideX, int outputsX, int imgSize,
3267 float scaleTargets, float scaleOutput) {
3268 int numImages = avgGrads.getNumCols();
3269
3270 int outputs = outputsX * outputsX;
3271 int imgPixels = imgSize * imgSize;
3272 int numFilters = avgGrads.getNumRows() / outputs;
3273 assert(avgGrads.getNumRows() == numFilters * outputs);
3274
3275 assert(!target.isTrans());
3276 assert(!avgGrads.isTrans());
3277 assert(avgGrads.isContiguous());
3278 assert(numFilters % 16 == 0);
3279 // assert(numImages % 128 == 0);
3280
3281 assert(strideX <= subsX);
3282
3283 target.resize(numFilters * imgPixels, numImages);
3284 assert(target.isContiguous());
3285 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3286 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3287 dim3 threads(32, 4);
3288 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 4)) * imgSize);
3289
3290 if (imgsPerThread == 4) {
3291 if (checkCaseBounds) {
3292 if (scaleTargets == 0 && scaleOutput == 1) {
3293 kLocalAvgUndo<4, 32, 4, 4, false, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3294 imgSize, numFilters, numImages, subsX, startX, strideX,
3295 outputsX, scaleTargets, scaleOutput);
3296 } else {
3297 kLocalAvgUndo<4, 32, 4, 4, true, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3298 imgSize, numFilters, numImages, subsX, startX, strideX,
3299 outputsX, scaleTargets, scaleOutput);
3300 }
3301 } else {
3302 if (scaleTargets == 0 && scaleOutput == 1) {
3303 kLocalAvgUndo<4, 32, 4, 4, false, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3304 imgSize, numFilters, numImages, subsX, startX, strideX,
3305 outputsX, scaleTargets, scaleOutput);
3306 } else {
3307 kLocalAvgUndo<4, 32, 4, 4, true, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3308 imgSize, numFilters, numImages, subsX, startX, strideX,
3309 outputsX, scaleTargets, scaleOutput);
3310 }
3311 }
3312 } else if (imgsPerThread == 2) {
3313 if (checkCaseBounds) {
3314 if (scaleTargets == 0 && scaleOutput == 1) {
3315 kLocalAvgUndo<4, 32, 2, 4, false, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3316 imgSize, numFilters, numImages, subsX, startX, strideX,
3317 outputsX, scaleTargets, scaleOutput);
3318 } else {
3319 kLocalAvgUndo<4, 32, 2, 4, true, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3320 imgSize, numFilters, numImages, subsX, startX, strideX,
3321 outputsX, scaleTargets, scaleOutput);
3322 }
3323 } else {
3324 if (scaleTargets == 0 && scaleOutput == 1) {
3325 kLocalAvgUndo<4, 32, 2, 4, false, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3326 imgSize, numFilters, numImages, subsX, startX, strideX,
3327 outputsX, scaleTargets, scaleOutput);
3328 } else {
3329 kLocalAvgUndo<4, 32, 2, 4, true, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3330 imgSize, numFilters, numImages, subsX, startX, strideX,
3331 outputsX, scaleTargets, scaleOutput);
3332 }
3333 }
3334 } else {
3335 if (checkCaseBounds) {
3336 if (scaleTargets == 0 && scaleOutput == 1) {
3337 kLocalAvgUndo<4, 32, 1, 4, false, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3338 imgSize, numFilters, numImages, subsX, startX, strideX,
3339 outputsX, scaleTargets, scaleOutput);
3340 } else {
3341 kLocalAvgUndo<4, 32, 1, 4, true, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3342 imgSize, numFilters, numImages, subsX, startX, strideX,
3343 outputsX, scaleTargets, scaleOutput);
3344 }
3345 } else {
3346 if (scaleTargets == 0 && scaleOutput == 1) {
3347 kLocalAvgUndo<4, 32, 1, 4, false, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3348 imgSize, numFilters, numImages, subsX, startX, strideX,
3349 outputsX, scaleTargets, scaleOutput);
3350 } else {
3351 kLocalAvgUndo<4, 32, 1, 4, true, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3352 imgSize, numFilters, numImages, subsX, startX, strideX,
3353 outputsX, scaleTargets, scaleOutput);
3354 }
3355 }
3356 }
3357
3358 cutilCheckMsg("convLocalAvgUndo: kernel execution failed");
3359 }
3360
3361 /*
3362 prob max undo
3363
3364 */
3365
3366 void localProbMaxUndo(NVMatrix& maxout_h, NVMatrix& maxout_p, NVMatrix& hGrads, NVMatrix& pGrads, NVMatrix& target_z,
3367 NVMatrix& target_t, int subsX, int startX, int strideX, int outputsX, int imgSize, float * gp_iszero, float * gh_iszero) {
3368 int outputs = outputsX * outputsX;
3369 int imgPixels = imgSize * imgSize;
3370 int numImages = maxout_h.getNumCols();
3371 int numFilters = maxout_h.getNumRows() / imgPixels;
3372
3373 assert(maxout_h.getNumRows() / numFilters == imgPixels);
3374 assert(maxout_h.getNumRows() == numFilters * imgPixels);
3375 assert(imgSize * imgSize == imgPixels);
3376
3377 assert(hGrads.getNumRows() == numFilters * imgPixels);
3378 assert(hGrads.getNumCols() == numImages);
3379
3380 assert(target_z.getNumRows() == numFilters * imgPixels);
3381 assert(target_z.getNumCols() == numImages);
3382
3383 assert(maxout_p.getNumRows() == numFilters * outputs);
3384 assert(maxout_p.getNumCols() == numImages);
3385
3386 assert(pGrads.getNumRows() == numFilters * outputs);
3387 assert(pGrads.getNumCols() == numImages);
3388
3389 assert(target_t.getNumRows() == numFilters * outputs);
3390 assert(target_t.getNumCols() == numImages);
3391
3392 assert(!maxout_h.isTrans());
3393 assert(!maxout_p.isTrans());
3394 assert(!target_t.isTrans());
3395 assert(!target_z.isTrans());
3396 assert(!hGrads.isTrans());
3397 assert(!pGrads.isTrans());
3398 assert(maxout_h.isContiguous());
3399 assert(maxout_p.isContiguous());
3400 assert(hGrads.isContiguous());
3401 assert(pGrads.isContiguous());
3402 assert(target_z.isContiguous());
3403 assert(target_t.isContiguous());
3404
3405 assert(numFilters % 16 == 0);
3406 assert(strideX <= subsX);
3407
3408 target_z.resize(maxout_h);
3409 target_t.resize(maxout_p);
3410
3411 int filtersPerThread = numFilters % 8 == 0 ? 2 : 1;
3412 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3413 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3414 dim3 threads(32, 4);
3415 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
3416
3417 if (imgsPerThread == 4) {
3418 if (filtersPerThread == 1) {
3419 if (checkCaseBounds) {
3420 kLocalProbMaxUndo<4, 32, 4, 1, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3421 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3422 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3423 } else {
3424 kLocalProbMaxUndo<4, 32, 4, 1, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3425 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3426 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3427 }
3428 } else {
3429 if (checkCaseBounds) {
3430 kLocalProbMaxUndo<4, 32, 4, 2, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3431 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3432 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3433 } else {
3434 kLocalProbMaxUndo<4, 32, 4, 2, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3435 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3436 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3437 }
3438 }
3439 }
3440 else if (imgsPerThread == 2) {
3441 if (filtersPerThread == 1) {
3442 if (checkCaseBounds) {
3443 kLocalProbMaxUndo<4, 32, 2, 1, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3444 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3445 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3446 } else {
3447 kLocalProbMaxUndo<4, 32, 2, 1, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3448 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3449 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3450 }
3451 } else {
3452 if (checkCaseBounds) {
3453 kLocalProbMaxUndo<4, 32, 2, 2, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3454 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3455 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3456 } else {
3457 kLocalProbMaxUndo<4, 32, 2, 2, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3458 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3459 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3460 }
3461 }
3462 }
3463 else {
3464 if (filtersPerThread == 1) {
3465 if (checkCaseBounds) {
3466 kLocalProbMaxUndo<4, 32, 1, 1, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3467 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3468 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3469 } else {
3470 kLocalProbMaxUndo<4, 32, 1, 1, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3471 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3472 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3473 }
3474 } else {
3475 if (checkCaseBounds) {
3476 kLocalProbMaxUndo<4, 32, 1, 2, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3477 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3478 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3479 } else {
3480 kLocalProbMaxUndo<4, 32, 1, 2, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3481 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3482 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3483 }
3484 }
3485 }
3486
3487 cutilCheckMsg("localProbMaxUndo: kernel execution failed");
3488 }
3489
3490
3491
3492 void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale) {
3493 convContrastNorm(images, images, denoms, target, numFilters, sizeX, addScale, powScale);
3494 }
3495
3496 /*
3497 * images: (numFilters, imgPixels, numImages)
3498 * meanDiffs: (numFilters, imgPixels, numImages)
3499 * denoms: (numFilters, imgPixels, numImages) (out)
3500 * target: (numFilters, imgPixels, numImages) (out)
3501 */
3502 void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale) {
3503 int numImages = images.getNumCols();
3504 int imgPixels = images.getNumRows() / numFilters;
3505 assert(images.getNumRows() == numFilters * imgPixels);
3506 int imgSize = int(sqrt((double)imgPixels));
3507 assert(imgSize * imgSize == imgPixels);
3508 assert(meanDiffs.isSameDims(images));
3509
3510 assert(!meanDiffs.isTrans());
3511 assert(!images.isTrans());
3512 assert(images.isContiguous());
3513 assert(meanDiffs.isContiguous());
3514 assert(numFilters % 16 == 0 || numFilters <= 8);
3515
3516 target.resize(images);
3517 denoms.resize(images);
3518 assert(target.isContiguous());
3519 if (sizeX >= 6 && numFilters % 4 == 0) {
3520 // This one is faster for large regions (my tests show regions >= 6...)
3521 int imgsPerThread = 8;
3522 int filtersPerThread = 4;
3523 int bx = 8;
3524 bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
3525 assert((imgsPerThread * bx) % 32 == 0);
3526 assert(numFilters % filtersPerThread == 0);
3527 dim3 threads(bx, 16);
3528 dim3 blocks(DIVUP(imgSize, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(imgSize, 4) * numFilters / filtersPerThread);
3529
3530 if (checkCaseBounds) {
3531 cudaFuncSetCacheConfig(kCNorm2<8, 8, 4, true>, cudaFuncCachePreferL1); // L1 faster here
3532 kCNorm2<8, 8, 4, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3533 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3534 } else {
3535 cudaFuncSetCacheConfig(kCNorm2<8, 8, 4, false>, cudaFuncCachePreferL1); // L1 faster here
3536 kCNorm2<8, 8, 4, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3537 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3538 }
3539 } else {
3540 bool checkCaseBounds = numImages % 128 != 0;
3541 if (numFilters <= 8) {
3542 dim3 threads(128);
3543 dim3 blocks(DIVUP(numImages,128) * imgSize, imgSize);
3544 if (numFilters == 1) {
3545 if (checkCaseBounds) {
3546 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 1, true>, cudaFuncCachePreferL1);
3547 kCNorm_fewfilter<1, 1, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3548 imgSize, numImages, sizeX, addScale, powScale);
3549 } else {
3550 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 1, false>, cudaFuncCachePreferL1);
3551 kCNorm_fewfilter<1, 1, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3552 imgSize, numImages, sizeX, addScale, powScale);
3553 }
3554 } else if (numFilters == 2) {
3555 if (checkCaseBounds) {
3556 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 2, true>, cudaFuncCachePreferL1);
3557 kCNorm_fewfilter<1, 2, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3558 imgSize, numImages, sizeX, addScale, powScale);
3559 } else {
3560 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 2, false>, cudaFuncCachePreferL1);
3561 kCNorm_fewfilter<1, 2, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3562 imgSize, numImages, sizeX, addScale, powScale);
3563 }
3564 } else if (numFilters == 3) {
3565 if (checkCaseBounds) {
3566 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 3, true>, cudaFuncCachePreferL1);
3567 kCNorm_fewfilter<1, 3, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3568 imgSize, numImages, sizeX, addScale, powScale);
3569 } else {
3570 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 3, false>, cudaFuncCachePreferL1);
3571 kCNorm_fewfilter<1, 3, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3572 imgSize, numImages, sizeX, addScale, powScale);
3573 }
3574 } else if (numFilters == 4) {
3575 if (checkCaseBounds) {
3576 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 4, true>, cudaFuncCachePreferL1);
3577 kCNorm_fewfilter<1, 4, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3578 imgSize, numImages, sizeX, addScale, powScale);
3579 } else {
3580 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 4, false>, cudaFuncCachePreferL1);
3581 kCNorm_fewfilter<1, 4, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3582 imgSize, numImages, sizeX, addScale, powScale);
3583 }
3584 } else if (numFilters == 5) {
3585 if (checkCaseBounds) {
3586 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 5, true>, cudaFuncCachePreferL1);
3587 kCNorm_fewfilter<1, 5, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3588 imgSize, numImages, sizeX, addScale, powScale);
3589 } else {
3590 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 5, false>, cudaFuncCachePreferL1);
3591 kCNorm_fewfilter<1, 5, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3592 imgSize, numImages, sizeX, addScale, powScale);
3593 }
3594 } else if (numFilters == 6) {
3595 if (checkCaseBounds) {
3596 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 6, true>, cudaFuncCachePreferL1);
3597 kCNorm_fewfilter<1, 6, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3598 imgSize, numImages, sizeX, addScale, powScale);
3599 } else {
3600 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 6, false>, cudaFuncCachePreferL1);
3601 kCNorm_fewfilter<1, 6, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3602 imgSize, numImages, sizeX, addScale, powScale);
3603 }
3604 } else if (numFilters == 7) {
3605 if (checkCaseBounds) {
3606 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 7, true>, cudaFuncCachePreferL1);
3607 kCNorm_fewfilter<1, 7, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3608 imgSize, numImages, sizeX, addScale, powScale);
3609 } else {
3610 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 7, false>, cudaFuncCachePreferL1);
3611 kCNorm_fewfilter<1, 7, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3612 imgSize, numImages, sizeX, addScale, powScale);
3613 }
3614 } else if (numFilters == 8) {
3615 if (checkCaseBounds) {
3616 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 8, true>, cudaFuncCachePreferL1);
3617 kCNorm_fewfilter<1, 8, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3618 imgSize, numImages, sizeX, addScale, powScale);
3619 } else {
3620 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 8, false>, cudaFuncCachePreferL1);
3621 kCNorm_fewfilter<1, 8, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3622 imgSize, numImages, sizeX, addScale, powScale);
3623 }
3624 }
3625 } else {
3626 dim3 threads(32, 4);
3627 dim3 blocks(DIVUP(numImages,32*4) * imgSize, (numFilters / (4 * 2)) * imgSize);
3628 if (checkCaseBounds) {
3629 cudaFuncSetCacheConfig(kCNorm_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
3630 kCNorm_manyfilter<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3631 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3632 } else {
3633 cudaFuncSetCacheConfig(kCNorm_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
3634 kCNorm_manyfilter<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3635 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3636 }
3637 }
3638 }
3639 cutilCheckMsg("convResponseNorm: kernel execution failed");
3640 }
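/*
 * For reference, a sketch of the forward pass this pairs with (inferred from
 * the -2 * addScale * powScale factor in convResponseNormUndo below, so treat
 * it as an assumption rather than a statement of the kernels' code):
 *   denoms = 1 + addScale * sum_{window}(meanDiffs^2)
 *   target = images * denoms^(-powScale)
 * with meanDiffs == images in the plain response-norm case.
 */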
3641
3642 void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target, int numFilters,
3643 int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput) {
3644 convResponseNormUndo(outGrads, denoms, meanDiffs, acts, target, numFilters, sizeX, addScale, powScale, scaleTargets, scaleOutput);
3645 }
3646
3647 /*
3648 * outGrads: (numFilters, imgPixels, numImages)
3649 * denoms: (numFilters, imgPixels, numImages)
3650 * inputs: (numFilters, imgPixels, numImages)
3651 * acts: (numFilters, imgPixels, numImages)
3652 * target: (numFilters, imgPixels, numImages)
3653 *
3654 * THIS WILL OVERWRITE THE ACTS MATRIX.
3655 */
3656 void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
3657 int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput) {
3658 int numImages = outGrads.getNumCols();
3659 int imgPixels = outGrads.getNumRows() / numFilters;
3660
3661 int imgSize = int(sqrt((double)imgPixels));
3662 assert(imgSize * imgSize == imgPixels);
3663
3664 assert(outGrads.getNumRows() == numFilters * imgPixels);
3665
3666 assert(denoms.isSameDims(outGrads));
3667 assert(acts.isSameDims(denoms));
3668 assert(!denoms.isTrans());
3669 assert(!outGrads.isTrans());
3670 assert(!acts.isTrans());
3671 assert(!target.isTrans());
3672 assert(outGrads.isContiguous());
3673
3674 assert(numFilters % 16 == 0);
3675
3676 target.resize(outGrads);
3677 assert(target.isContiguous());
3678 // First do acts := -2 x scale x acts x outGrads / denoms
3679 // so that the main routine only has to do an addition in its inner loop.
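    // Sketch of the chain rule being implemented, assuming the forward pass
    //   y_i = x_i * d_i^(-powScale), d_i = 1 + addScale * sum_{j in window(i)} x_j^2:
    //   dL/dx_i = g_i * d_i^(-powScale)
    //           - 2 * addScale * powScale * x_i * sum_{j : i in window(j)} g_j * y_j / d_j.
    // The prelim kernel folds everything after the minus sign, except x_i and the
    // window sum, into acts; the main kernel then accumulates acts over the window,
    // multiplies by the input, and adds g_i * d_i^(-powScale).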
3680 int prelimEltsPerThread = 4;
3681 dim3 threads(128);
3682 dim3 blocks(MIN(512, DIVUP(outGrads.getNumElements(),(threads.x * prelimEltsPerThread))));
3683 kRNormUndoPrelims<128, 4><<<blocks, threads>>>(acts.getDevData(), denoms.getDevData(), outGrads.getDevData(), outGrads.getNumElements(), -2*addScale*powScale);
3684
3685 // Now the main routine
3686 if (sizeX >= 6 && numFilters % 4 == 0) {
3687 // This one is faster for large regions (my tests show regions >= 6...)
3688 int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
3689 int filtersPerThread = 4;
3690 int bx = 16;
3691 bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
3692 assert((imgsPerThread * bx) % 32 == 0);
3693
3694 threads = dim3(bx, 16);
3695 blocks = dim3(DIVUP(imgSize, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(imgSize, 4) * numFilters / filtersPerThread);
3696 if (imgsPerThread == 8) {
3697 if (checkCaseBounds) {
3698 if (scaleTargets == 0 && scaleOutput == 1) {
3699 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, false, true>, cudaFuncCachePreferL1);
3700 kRNormUndo2<16, 8, 4, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3701 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3702 scaleTargets, scaleOutput);
3703 } else {
3704 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, true, true>, cudaFuncCachePreferL1);
3705 kRNormUndo2<16, 8, 4, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3706 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3707 scaleTargets, scaleOutput);
3708 }
3709 } else {
3710 if (scaleTargets == 0 && scaleOutput == 1) {
3711 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, false, false>, cudaFuncCachePreferL1);
3712 kRNormUndo2<16, 8, 4, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3713 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3714 scaleTargets, scaleOutput);
3715 } else {
3716 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, true, false>, cudaFuncCachePreferL1);
3717 kRNormUndo2<16, 8, 4, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3718 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3719 scaleTargets, scaleOutput);
3720 }
3721 }
3722 } else if (imgsPerThread == 4) {
3723 if (checkCaseBounds) {
3724 if (scaleTargets == 0 && scaleOutput == 1) {
3725 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, false, true>, cudaFuncCachePreferL1);
3726 kRNormUndo2<16, 4, 4, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3727 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3728 scaleTargets, scaleOutput);
3729 } else {
3730 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, true, true>, cudaFuncCachePreferL1);
3731 kRNormUndo2<16, 4, 4, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3732 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3733 scaleTargets, scaleOutput);
3734 }
3735 } else {
3736 if (scaleTargets == 0 && scaleOutput == 1) {
3737 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, false, false>, cudaFuncCachePreferL1);
3738 kRNormUndo2<16, 4, 4, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3739 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3740 scaleTargets, scaleOutput);
3741 } else {
3742 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, true, false>, cudaFuncCachePreferL1);
3743 kRNormUndo2<16, 4, 4, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3744 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3745 scaleTargets, scaleOutput);
3746 }
3747 }
3748 } else {
3749 if (checkCaseBounds) {
3750 if (scaleTargets == 0 && scaleOutput == 1) {
3751 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, false, true>, cudaFuncCachePreferL1);
3752 kRNormUndo2<16, 2, 4, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3753 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3754 scaleTargets, scaleOutput);
3755 } else {
3756 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, true, true>, cudaFuncCachePreferL1);
3757 kRNormUndo2<16, 2, 4, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3758 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3759 scaleTargets, scaleOutput);
3760 }
3761 } else {
3762 if (scaleTargets == 0 && scaleOutput == 1) {
3763 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, false, false>, cudaFuncCachePreferL1);
3764 kRNormUndo2<16, 2, 4, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3765 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3766 scaleTargets, scaleOutput);
3767 } else {
3768 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, true, false>, cudaFuncCachePreferL1);
3769 kRNormUndo2<16, 2, 4, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3770 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3771 scaleTargets, scaleOutput);
3772 }
3773 }
3774 }
3775 } else {
3776 int imgsPerThread = numImages % 64 == 0 ? 2 : 1;
3777 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3778 threads = dim3(32, 4);
3779 blocks = dim3(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 2)) * imgSize);
3780
3781 if (imgsPerThread == 2) {
3782 if (checkCaseBounds) {
3783 if (scaleTargets == 0 && scaleOutput == 1) {
3784 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, false, true>, cudaFuncCachePreferL1);
3785 kRNormUndo<4, 32, 2, 2, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3786 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3787 scaleTargets, scaleOutput);
3788 } else {
3789 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, true, true>, cudaFuncCachePreferL1);
3790 kRNormUndo<4, 32, 2, 2, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3791 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3792 scaleTargets, scaleOutput);
3793 }
3794 } else {
3795 if (scaleTargets == 0 && scaleOutput == 1) {
3796 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, false, false>, cudaFuncCachePreferL1);
3797 kRNormUndo<4, 32, 2, 2, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3798 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3799 scaleTargets, scaleOutput);
3800 } else {
3801 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, true, false>, cudaFuncCachePreferL1);
3802 kRNormUndo<4, 32, 2, 2, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3803 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3804 scaleTargets, scaleOutput);
3805 }
3806 }
3807 } else {
3808 if (checkCaseBounds) {
3809 if (scaleTargets == 0 && scaleOutput == 1) {
3810 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, false, true>, cudaFuncCachePreferL1);
3811 kRNormUndo<4, 32, 1, 2, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3812 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3813 scaleTargets, scaleOutput);
3814 } else {
3815 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, true, true>, cudaFuncCachePreferL1);
3816 kRNormUndo<4, 32, 1, 2, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3817 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3818 scaleTargets, scaleOutput);
3819 }
3820 } else {
3821 if (scaleTargets == 0 && scaleOutput == 1) {
3822 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, false, false>, cudaFuncCachePreferL1);
3823 kRNormUndo<4, 32, 1, 2, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3824 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3825 scaleTargets, scaleOutput);
3826 } else {
3827 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, true, false>, cudaFuncCachePreferL1);
3828 kRNormUndo<4, 32, 1, 2, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3829 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3830 scaleTargets, scaleOutput);
3831 }
3832 }
3833 }
3834 }
3835 cutilCheckMsg("kRNormUndo: kernel execution failed");
3836 }
3837
3838 /*
3839 * imgs: (numChannels, imgPixels, numImages) with given imgStride
3840 * target: (numChannels, tgtPixels, numImages)
3841 *
3842 * imgSize = scale * tgtSize
3843 */
3844 void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale) {
3845 assert(!images.isTrans());
3846 assert(!target.isTrans());
3847 int imgPixels = imgSize * imgSize;
3848 int tgtPixels = tgtSize * tgtSize;
3849 int numChannels = images.getNumRows() / imgPixels;
3850 int numImages = images.getNumCols();
3851 assert(images.getNumRows() == numChannels * imgPixels);
3852
3853 target.resize(numChannels * tgtPixels, numImages);
3854 assert(target.isContiguous());
3855 int numChunksX = DIVUP(tgtSize, 4);
3856 int numChunks = numChunksX * numChunksX;
3857 double imgCenter = imgSize * 0.5;
3858 double tgtCenter = tgtSize * 0.5;
3859 double centerScale = imgCenter - tgtCenter * scale;
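    // The kernel is assumed to map target pixel t to source coordinate
    // t * scale + centerScale, which aligns the image centers: e.g.
    // imgSize = 256, tgtSize = 64, scale = 4 gives centerScale = 128 - 32*4 = 0.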
3860
3861 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3862 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3863
3864 dim3 threads(32, 16);
3865 dim3 blocks(DIVUP(numImages, imgsPerThread * 32), numChannels * numChunks);
3866 if (imgsPerThread == 4) {
3867 if (checkCaseBounds) {
3868 cudaFuncSetCacheConfig(kResizeBilinear<4, true>, cudaFuncCachePreferL1);
3869 kResizeBilinear<4, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3870 } else {
3871 cudaFuncSetCacheConfig(kResizeBilinear<4, false>, cudaFuncCachePreferL1);
3872 kResizeBilinear<4, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3873 }
3874 } else if (imgsPerThread == 2) {
3875 if (checkCaseBounds) {
3876 cudaFuncSetCacheConfig(kResizeBilinear<2, true>, cudaFuncCachePreferL1);
3877 kResizeBilinear<2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3878 } else {
3879 cudaFuncSetCacheConfig(kResizeBilinear<2, false>, cudaFuncCachePreferL1);
3880 kResizeBilinear<2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3881 }
3882 } else {
3883 if (checkCaseBounds) {
3884 cudaFuncSetCacheConfig(kResizeBilinear<1, true>, cudaFuncCachePreferL1);
3885 kResizeBilinear<1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3886 } else {
3887 cudaFuncSetCacheConfig(kResizeBilinear<1, false>, cudaFuncCachePreferL1);
3888 kResizeBilinear<1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3889 }
3890 }
3891 cutilCheckMsg("convResizeBilinear: kernel execution failed");
3892 }
3893
3894 /*
3895 * imgs: (3, imgPixels, numImages) with given imgStride
3896 * target: (3, imgPixels, numImages)
3897 */
3898 void convRGBToYUV(NVMatrix& images, NVMatrix& target) {
3899 assert(!images.isTrans());
3900 assert(!target.isTrans());
3901 int imgPixels = images.getNumRows() / 3;
3902 int numImages = images.getNumCols();
3903 assert(images.getNumRows() == 3 * imgPixels);
3904
3905 target.resize(3 * imgPixels, numImages);
3906 assert(target.isContiguous());
3907 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3908 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3909
3910 dim3 threads(32, 4);
3911 dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4));
3912 if (imgsPerThread == 4) {
3913 if (checkCaseBounds) {
3914 cudaFuncSetCacheConfig(kRGBToYUV<4, true>, cudaFuncCachePreferL1);
3915 kRGBToYUV<4, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3916 } else {
3917 cudaFuncSetCacheConfig(kRGBToYUV<4, false>, cudaFuncCachePreferL1);
3918 kRGBToYUV<4, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3919 }
3920 } else if (imgsPerThread == 2) {
3921 if (checkCaseBounds) {
3922 cudaFuncSetCacheConfig(kRGBToYUV<2, true>, cudaFuncCachePreferL1);
3923 kRGBToYUV<2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3924 } else {
3925 cudaFuncSetCacheConfig(kRGBToYUV<2, false>, cudaFuncCachePreferL1);
3926 kRGBToYUV<2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3927 }
3928 } else {
3929 if (checkCaseBounds) {
3930 cudaFuncSetCacheConfig(kRGBToYUV<1, true>, cudaFuncCachePreferL1);
3931 kRGBToYUV<1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3932 } else {
3933 cudaFuncSetCacheConfig(kRGBToYUV<1, false>, cudaFuncCachePreferL1);
3934 kRGBToYUV<1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3935 }
3936 }
3937 cutilCheckMsg("convRGBToYUV: kernel execution failed");
3938 }
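/*
 * kRGBToYUV is defined elsewhere; if it follows the usual BT.601 convention
 * (an assumption, not verified against that kernel), the per-pixel transform is
 *   Y =  0.299 R + 0.587 G + 0.114 B
 *   U = -0.147 R - 0.289 G + 0.436 B
 *   V =  0.615 R - 0.515 G - 0.100 B
 */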
3939
3940 /*
3941 * imgs: (3, imgPixels, numImages) with given imgStride
3942 * target: (3, imgPixels, numImages)
3943 */
3944 void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center) {
3945 assert(!images.isTrans());
3946 assert(!target.isTrans());
3947 int imgPixels = images.getNumRows() / 3;
3948 int numImages = images.getNumCols();
3949 assert(images.getNumRows() == 3 * imgPixels);
3950
3951 target.resize(3 * imgPixels, numImages);
3952 assert(target.isContiguous());
3953
3954 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3955 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3956 dim3 threads(32, 4);
3957 dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4));
3958
3959 if (imgsPerThread == 4) {
3960 if (center) {
3961 if (checkCaseBounds) {
3962 cudaFuncSetCacheConfig(kRGBToLAB<4, true, true>, cudaFuncCachePreferL1);
3963 kRGBToLAB<4, true, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3964 } else {
3965 cudaFuncSetCacheConfig(kRGBToLAB<4, false, true>, cudaFuncCachePreferL1);
3966 kRGBToLAB<4, false, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3967 }
3968 } else {
3969 if (checkCaseBounds) {
3970 cudaFuncSetCacheConfig(kRGBToLAB<4, true, false>, cudaFuncCachePreferL1);
3971 kRGBToLAB<4, true, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3972 } else {
3973 cudaFuncSetCacheConfig(kRGBToLAB<4, false, false>, cudaFuncCachePreferL1);
3974 kRGBToLAB<4, false, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3975 }
3976 }
3977 } else if (imgsPerThread == 2) {
3978 if (center) {
3979 if (checkCaseBounds) {
3980 cudaFuncSetCacheConfig(kRGBToLAB<2, true, true>, cudaFuncCachePreferL1);
3981 kRGBToLAB<2, true, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3982 } else {
3983 cudaFuncSetCacheConfig(kRGBToLAB<2, false, true>, cudaFuncCachePreferL1);
3984 kRGBToLAB<2, false, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3985 }
3986 } else {
3987 if (checkCaseBounds) {
3988 cudaFuncSetCacheConfig(kRGBToLAB<2, true, false>, cudaFuncCachePreferL1);
3989 kRGBToLAB<2, true, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3990 } else {
3991 cudaFuncSetCacheConfig(kRGBToLAB<2, false, false>, cudaFuncCachePreferL1);
3992 kRGBToLAB<2, false, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3993 }
3994 }
3995 } else {
3996 if (center) {
3997 if (checkCaseBounds) {
3998 cudaFuncSetCacheConfig(kRGBToLAB<1, true, true>, cudaFuncCachePreferL1);
3999 kRGBToLAB<1, true, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4000 } else {
4001 cudaFuncSetCacheConfig(kRGBToLAB<1, false, true>, cudaFuncCachePreferL1);
4002 kRGBToLAB<1, false, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4003 }
4004 } else {
4005 if (checkCaseBounds) {
4006 cudaFuncSetCacheConfig(kRGBToLAB<1, true, false>, cudaFuncCachePreferL1);
4007 kRGBToLAB<1, true, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4008 } else {
4009 cudaFuncSetCacheConfig(kRGBToLAB<1, false, false>, cudaFuncCachePreferL1);
4010 kRGBToLAB<1, false, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4011 }
4012 }
4013 }
4014 cutilCheckMsg("convRGBToLAB: kernel execution failed");
4015 }
4016
4017 /*
4018 * imgs: (numChannels, imgPixels, numImages) with given imgStride
4019 * target: (numChannels, tgtPixels, numImages)
4020 */
4021 void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX) {
4022 int numImages = imgs.getNumCols();
4023 int imgPixels = imgSize * imgSize;
4024 int tgtPixels = tgtSize * tgtSize;
4025
4026 int numChannels = imgs.getNumRows() / imgPixels;
4027 assert(imgs.getNumRows() == imgPixels * numChannels);
4028 assert(imgPixels == imgSize * imgSize);
4029 assert(imgSize - startY >= tgtSize);
4030 assert(imgSize - startX >= tgtSize);
4031 assert(startY >= 0);
4032 assert(startX >= 0);
4033 target.resize(numChannels * tgtPixels, numImages);
4034 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
4035 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
4036 dim3 blocks(DIVUP(numImages, 32 * imgsPerThread), numChannels * DIVUP(tgtPixels, 4));
4037 dim3 threads(32, 4);
4038 if (imgsPerThread == 4) {
4039 if (checkCaseBounds) {
4040 kCrop<4, true><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4041 } else {
4042 kCrop<4, false><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4043 }
4044 } else if (imgsPerThread == 2) {
4045 if (checkCaseBounds) {
4046 kCrop<2, true><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4047 } else {
4048 kCrop<2, false><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4049 }
4050 } else {
4051 if (checkCaseBounds) {
4052 kCrop<1, true><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4053 } else {
4054 kCrop<1, false><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4055 }
4056 }
4057 cutilCheckMsg("convCrop: kernel execution failed");
4058 }
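/*
 * Usage sketch (hypothetical values): a centered crop from 32x32 down to 24x24
 * uses startY = startX = (32 - 24) / 2 = 4, which satisfies the
 * imgSize - start >= tgtSize assertions above.
 */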
4059
4060 /*
4061 * images: (numFilters, imgPixels, numImages)
4062 * ticas: (numFilters, imgPixels, numImages)
4063 * target: (numFilters, imgPixels, numImages) (out)
4064 *
4065 * Computes TICA-style gradient for given feature maps
4066 * f(x) = exp(-(sum_i{x_i^2}^(1/2)))
4067 * dlogf(x)/dx_i = -x_i / (sum_i{x_i^2}^(1/2) + eps)
4068 *
4069 * eps added for numerical stability
4070 */
4071 void convTICAGrad(NVMatrix& images, NVMatrix& ticas, NVMatrix& target, int numFilters, int sizeX, float scaleTarget, float scaleOutput) {
4072 int numImages = images.getNumCols();
4073 int imgPixels = images.getNumRows() / numFilters;
4074 assert(images.getNumRows() == numFilters * imgPixels);
4075 int imgSize = int(sqrt((double)imgPixels));
4076 assert(imgSize * imgSize == imgPixels);
4077
4078 assert(!images.isTrans());
4079 assert(images.isContiguous());
4080 assert(numFilters % 16 == 0 || numFilters <= 8);
4081
4082 assert(ticas.isSameDims(images));
4083 assert(ticas.isContiguous());
4084
4085 if (scaleTarget == 0) {
4086 target.resize(images);
4087 } else {
4088 assert(target.isSameDims(images));
4089 }
4090 assert(target.isContiguous());
4091
4092 // TEMPORARY
4093 assert(numFilters > 8);
4094 assert(sizeX < 6);
4095
4096 dim3 threads(32, 4);
4097 dim3 blocks(DIVUP(numImages, 32*4) * imgSize, (numFilters / (4 * 2)) * imgSize);
4098 bool checkCaseBounds = (numImages % 128) != 0;
4099 if (checkCaseBounds) {
4100 cudaFuncSetCacheConfig(kTICAGrad_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
4101 kTICAGrad_manyfilter<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), ticas.getDevData(), target.getDevData(),
4102 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4103 } else {
4104 cudaFuncSetCacheConfig(kTICAGrad_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
4105 kTICAGrad_manyfilter<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), ticas.getDevData(), target.getDevData(),
4106 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4107 }
4108
4109 cutilCheckMsg("convTICAGrad: kernel execution failed");
4110 }
4111
4112 /*
4113 * images: (numFilters, imgPixels, numImages)
4114 * target: (numFilters, imgPixels, numImages) (out)
4115 *
4116 * Computes TICA-style gradient for given feature maps
4117 * f(x) = exp(-(sum_i{x_i^2}^(1/2)))
4118  * dlogf(x)/dx_i = -x_i / (sum_i{x_i^2}^(1/2) + eps)
4119 *
4120 * eps added for numerical stability
4121 */
4122 void convTICA(NVMatrix& images, NVMatrix& target, int numFilters, int sizeX, float scaleTarget, float scaleOutput) {
4123 int numImages = images.getNumCols();
4124 int imgPixels = images.getNumRows() / numFilters;
4125 assert(images.getNumRows() == numFilters * imgPixels);
4126 int imgSize = int(sqrt((double)imgPixels));
4127 assert(imgSize * imgSize == imgPixels);
4128
4129 assert(!images.isTrans());
4130 assert(images.isContiguous());
4131 assert(numFilters % 16 == 0 || numFilters <= 8);
4132
4133 if (scaleTarget == 0) {
4134 target.resize(images);
4135 } else {
4136 assert(target.isSameDims(images));
4137 }
4138 assert(target.isContiguous());
4139
4140 // TEMPORARY
4141 assert(numFilters > 8);
4142 assert(sizeX < 6);
4143
4144 dim3 threads(32, 4);
4145 dim3 blocks(DIVUP(numImages, 32*4) * imgSize, (numFilters / (4 * 2)) * imgSize);
4146 bool checkCaseBounds = (numImages % 128) != 0;
4147 if (checkCaseBounds) {
4148 cudaFuncSetCacheConfig(kTICA_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
4149 kTICA_manyfilter<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
4150 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4151 } else {
4152 cudaFuncSetCacheConfig(kTICA_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
4153 kTICA_manyfilter<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
4154 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4155 }
4156
4157 cutilCheckMsg("convTICA: kernel execution failed");
4158 }
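/*
 * Launch-shape note for convTICAGrad/convTICA above (added commentary): with
 * threads = (32, 4) and 4 * 2 = 8 filters handled per block, the grid is
 *
 *     blocks = (DIVUP(numImages, 128) * imgSize, (numFilters / 8) * imgSize),
 *
 * e.g. numImages = 128, numFilters = 16, imgSize = 21 gives a 21 x 42 grid.
 * The TEMPORARY assert numFilters > 8, combined with the earlier
 * numFilters % 16 == 0 || numFilters <= 8 check, forces numFilters to be a
 * multiple of 16, so the division by 8 is exact.
 */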
4159
4160
4161 /*
4162 * images: (numFilters, imgPixels, numImages)
4163 * meanDiffs: (numFilters, imgPixels, numImages)
4164 * denoms: (numFilters, imgPixels, numImages) (out)
4165 * target: (numFilters, imgPixels, numImages) (out)
4166
4167 * Note: at present, I have no code to compute the meanDiffs. So it should be set
4168 * to be equal to images. In other words, this isn't really doing contrast normalization,
4169 * just response normalization.
4170 */
4171 void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target,
4172 int numFilters, int sizeF, float addScale, float powScale, bool blocked) {
4173 int numImages = images.getNumCols();
4174 int imgPixels = images.getNumRows() / numFilters;
4175 assert(images.getNumRows() == numFilters * imgPixels);
4176 int imgSize = int(sqrt((double)imgPixels));
4177 assert(imgSize * imgSize == imgPixels);
4178 assert(meanDiffs.isSameDims(images));
4179 assert(sizeF > 0 && sizeF <= numFilters);
4180
4181 assert(!meanDiffs.isTrans());
4182 assert(!images.isTrans());
4183 assert(images.isContiguous());
4184 assert(meanDiffs.isContiguous());
4185 assert(numFilters % 16 == 0);
4186
4187 target.resize(images);
4188 denoms.resize(images);
4189 assert(target.isContiguous());
4190
4191 bool checkCaseBounds = numImages % 128 != 0;
4192
4193 dim3 threads(32, 4);
4194 dim3 blocks(DIVUP(numImages,32*4) * imgSize, (numFilters / 4) * imgSize);
4195 if (blocked) {
4196 if (checkCaseBounds) {
4197 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, true, true>, cudaFuncCachePreferL1);
4198 kFCNorm<4, 32, 4, true, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4199 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4200 } else {
4201 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, false, true>, cudaFuncCachePreferL1);
4202 kFCNorm<4, 32, 4, false, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4203 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4204 }
4205 } else {
4206 if (checkCaseBounds) {
4207 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, true, false>, cudaFuncCachePreferL1);
4208 kFCNorm<4, 32, 4, true, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4209 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4210 } else {
4211 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, false, false>, cudaFuncCachePreferL1);
4212 kFCNorm<4, 32, 4, false, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4213 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4214 }
4215 }
4216
4217 cutilCheckMsg("convContrastNormCrossMap: kernel execution failed");
4218 }
4219
4220 /*
4221 * outGrads: (numFilters, imgPixels, numImages)
4222 * denoms: (numFilters, imgPixels, numImages)
4223 * inputs: (numFilters, imgPixels, numImages)
4224 * acts: (numFilters, imgPixels, numImages)
4225 * target: (numFilters, imgPixels, numImages)
4226 *
4227 * THIS WILL OVERWRITE THE ACTS MATRIX.
4228 */
4229 void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
4230 int sizeF, float addScale, float powScale, bool blocked, float scaleTargets, float scaleOutput) {
4231 int numImages = outGrads.getNumCols();
4232 int imgPixels = outGrads.getNumRows() / numFilters;
4233
4234 int imgSize = int(sqrt((double)imgPixels));
4235 assert(imgSize * imgSize == imgPixels);
4236 assert(sizeF > 0 && sizeF <= numFilters);
4237 assert(outGrads.getNumRows() == numFilters * imgPixels);
4238
4239 assert(denoms.isSameDims(outGrads));
4240 assert(acts.isSameDims(denoms));
4241 assert(!denoms.isTrans());
4242 assert(!outGrads.isTrans());
4243 assert(!acts.isTrans());
4244 assert(!target.isTrans());
4245 assert(outGrads.isContiguous());
4246
4247 assert(numFilters % 16 == 0);
4248
4249 target.resize(outGrads);
4250 assert(target.isContiguous());
4251     // First do acts := -2 * addScale * powScale * acts * outGrads / denoms
4252 // so that the main routine only has to do an addition in its inner loop.
4253 int prelimEltsPerThread = 4;
4254 dim3 threads(128);
4255 dim3 blocks(MIN(512, DIVUP(outGrads.getNumElements(),(threads.x * prelimEltsPerThread))));
4256 kRNormUndoPrelims<128, 4><<<blocks, threads>>>(acts.getDevData(), denoms.getDevData(), outGrads.getDevData(), outGrads.getNumElements(), -2*addScale*powScale);
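    /*
     * Commentary on the prelim pass above (added; assumes the standard
     * cross-map response norm with f_j = x_j * d_j^(-p), where
     * d_j = 1 + addScale * sum_{k in N(j)} x_k^2 and p = powScale):
     *
     *     dE/dx_i = g_i * d_i^(-p)
     *             - 2 * addScale * p * x_i * sum_{j : i in N(j)} g_j * x_j * d_j^(-p-1)
     *
     * Since acts holds f_j on entry, the prelim rewrite
     * acts_j := -2 * addScale * p * g_j * acts_j / d_j yields exactly
     * -2 * addScale * p * g_j * x_j * d_j^(-p-1), so the main routine below
     * only has to sum acts over the neighborhood, multiply by x_i, and add
     * the g_i * d_i^(-p) term.
     */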
4257
4258 // Now the main routine
4259
4260 dim3 threads2 = dim3(32, 4);
4261 dim3 blocks2 = dim3(DIVUP(numImages,32*4) * imgSize, (numFilters / 4) * imgSize);
4262 bool checkCaseBounds = (numImages % 128) != 0;
4263 if (blocked) {
4264 if (scaleTargets == 0 && scaleOutput == 1) {
4265 if (checkCaseBounds) {
4266 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, true, true>, cudaFuncCachePreferL1);
4267 kFRNormUndo<4, 32, 4, false, true, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4268 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4269 scaleTargets, scaleOutput);
4270 } else {
4271 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, false, true>, cudaFuncCachePreferL1);
4272 kFRNormUndo<4, 32, 4, false, false, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4273 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4274 scaleTargets, scaleOutput);
4275 }
4276 } else {
4277 if (checkCaseBounds) {
4278 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, true, true>, cudaFuncCachePreferL1);
4279 kFRNormUndo<4, 32, 4, true, true, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4280 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4281 scaleTargets, scaleOutput);
4282 } else {
4283 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, false, true>, cudaFuncCachePreferL1);
4284 kFRNormUndo<4, 32, 4, true, false, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4285 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4286 scaleTargets, scaleOutput);
4287 }
4288 }
4289 } else {
4290 if (scaleTargets == 0 && scaleOutput == 1) {
4291 if (checkCaseBounds) {
4292 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, true, false>, cudaFuncCachePreferL1);
4293 kFRNormUndo<4, 32, 4, false, true, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4294 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4295 scaleTargets, scaleOutput);
4296 } else {
4297 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, false, false>, cudaFuncCachePreferL1);
4298 kFRNormUndo<4, 32, 4, false, false, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4299 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4300 scaleTargets, scaleOutput);
4301 }
4302 } else {
4303 if (checkCaseBounds) {
4304 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, true, false>, cudaFuncCachePreferL1);
4305 kFRNormUndo<4, 32, 4, true, true, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4306 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4307 scaleTargets, scaleOutput);
4308 } else {
4309 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, false, false>, cudaFuncCachePreferL1);
4310 kFRNormUndo<4, 32, 4, true, false, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4311 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4312 scaleTargets, scaleOutput);
4313 }
4314 }
4315 }
4316
4317 cutilCheckMsg("convResponseNormCrossMapUndo: kernel execution failed");
4318 }
4319
4320 void convResponseNormCrossMap(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeF, float addScale, float powScale, bool blocked) {
4321 convContrastNormCrossMap(images, images, denoms, target, numFilters, sizeF, addScale, powScale, blocked);
4322 }
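/*
 * Illustrative forward/backward pairing (added commentary, not in the original
 * file; matrices and sizes here are hypothetical, numFilters = 64, sizeF = 5):
 *
 *     convResponseNormCrossMap(images, denoms, acts, 64, 5, 1e-4f, 0.75f, true);
 *     // ... forward the acts, obtain outGrads from the layer above ...
 *     convResponseNormCrossMapUndo(outGrads, denoms, images, acts, inputGrads,
 *                                  64, 5, 1e-4f, 0.75f, true, 0, 1);
 *     // NB: as documented above, the undo overwrites acts.
 */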
4323
4324 /*
4325 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
4326 * All rights reserved.
4327 *
4328 * Redistribution and use in source and binary forms, with or without modification,
4329 * are permitted provided that the following conditions are met:
4330 *
4331 * - Redistributions of source code must retain the above copyright notice,
4332 * this list of conditions and the following disclaimer.
4333 *
4334 * - Redistributions in binary form must reproduce the above copyright notice,
4335 * this list of conditions and the following disclaimer in the documentation
4336 * and/or other materials provided with the distribution.
4337 *
4338 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
4339 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4340 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4341 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
4342 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4343 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
4344 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
4345 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4346 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
4347 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4348 */
4349
4350 #ifndef _CUDACONV2_EXPORT
4351 #define _CUDACONV2_EXPORT
4352 #endif
4353
4354 #include <cutil_inline.h>
4355 #include <nvmatrix.cuh>
4356 #include <cudaconv2.cuh>
4357
4358 /*
4359 * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images.
4360 * threadIdx.x determines image
4361 * threadIdx.y determines filter
4362 *
4363 * blockIdx.x determines image batch of B_X * imgsPerThread
4364  * blockIdx.y determines both the module and the filter batch of B_Y * filtersPerThread
4365 *
4366 * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given
4367 * filters: (numColors, filterPixels, numFilters) if conv
4368 * (numModules, numColors, filterPixels, numFilters) otherwise
4369 *
4370 * targets: (numFilters, numModulesY, numModulesX, numImages)
4371 *
4372 * B_Y one of 4, 8, 16
4373 * B_X one of 16, 32
4374 * imgsPerThread one of 1, 2, 4
4375 * filtersPerThread one of 1, 2, 4, 8
4376 *
4377 * Number of filters per module should be divisible by B_Y * filtersPerThread
4378 * checkImgBounds indicates whether number of images is divisible by B_X * imgsPerThread
4379 *
4380 * The imgSize here is the size of the actual image without the padding.
4381 *
4382 */
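/*
 * Worked example of the decomposition above (added commentary): with
 * B_Y = 4, B_X = 32, filtersPerThread = 8, imgsPerThread = 4, numFilters = 32
 * and numImages = 128, each block covers 4 * 8 = 32 filters and 32 * 4 = 128
 * images, so blocksPerModule = 32 / 32 = 1 and the grid is
 * (DIVUP(128, 128), numModules * 1) = (1, numModules); the kernel then
 * recovers moduleIdx = blockIdx.y / blocksPerModule and the filter batch
 * from blockIdx.y % blocksPerModule.
 */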
4383 template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int numColors,
4384 bool scale, bool checkImgBounds>
4385 __global__ void filterActs_YxX_color(float* images, float* filters, float* targets,
4386 const int numImages, const int numFilters,
4387 const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
4388 const int moduleStride,
4389 const int numModulesY, const int numModulesX, const int imgStride,
4390 const float scaleTargets, const float scaleOutputs,
4391 const bool conv) {
4392 __shared__ float shFilters[B_Y*numColors][B_Y * filtersPerThread]; // pre-load B_Y pixels from B_Y*filtersPerThread filters
4393 __shared__ float shImages[B_Y*numColors][B_X * imgsPerThread]; // pre-load B_Y pixels from B_X*imgsPerThread images
4394 const int imgPixels = imgSizeY * imgSizeX;
4395 const int filterPixels = filterSize * filterSize;
4396
4397 const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
4398 const int moduleIdx = blockIdx.y / blocksPerModule;
4399 const int blockFilterIdx = blockIdx.y % blocksPerModule;
4400
4401 const int tidx = threadIdx.y * B_X + threadIdx.x;
4402
4403 const int imgLoadModPosY = (moduleIdx / numModulesX) * moduleStride;
4404 const int imgLoadModPosX = (moduleIdx % numModulesX) * moduleStride;
4405
4406 const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
4407 const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
4408 const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
4409 images += myImgIdx;
4410 filters += filtersPerThread * B_Y * blockFilterIdx
4411 + shFilterLoadY * numFilters + shFilterLoadX;
4412 if (!conv) {
4413 filters += moduleIdx * numColors * filterPixels * numFilters;
4414 }
4415
4416 targets += moduleIdx * numImages
4417 + (blockFilterIdx * B_Y * filtersPerThread + threadIdx.y) * numImages * numModulesY * numModulesX
4418 + myImgIdx;
4419
4420
4421 float prod[filtersPerThread][imgsPerThread];
4422 #pragma unroll
4423 for(int f = 0; f < filtersPerThread; f++) {
4424 #pragma unroll
4425 for(int g = 0; g < imgsPerThread; g++) {
4426 prod[f][g] = 0;
4427 }
4428 }
4429
4430 for (int p = 0; p < filterPixels; p += B_Y) {
4431 /*
4432 * Load B_Y pixels from B_Y*filtersPerThread filters
4433 */
4434 if (shFilterLoadY < B_Y) {
4435 #pragma unroll
4436 for (int p2 = 0; p2 < B_Y; p2 += B_X/filtersPerThread) {
4437 if (p + p2 + shFilterLoadY < filterPixels) {
4438 #pragma unroll
4439 for (int c = 0; c < numColors; c++) {
4440 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = filters[(c * filterPixels + p + p2) * numFilters];
4441 }
4442 } else {
4443 #pragma unroll
4444 for (int c = 0; c < numColors; c++) {
4445 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = 0;
4446 }
4447 }
4448 }
4449 }
4450
4451 /*
4452 * Load B_Y pixels from B_X*imgsPerThread images
4453 */
4454 const int pixIdx = p + threadIdx.y;
4455 if (pixIdx < filterPixels) {
4456 const int x = paddingStart + imgLoadModPosX + pixIdx % filterSize;
4457 const int y = paddingStart + imgLoadModPosY + pixIdx / filterSize;
4458             if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
4459 #pragma unroll
4460 for (int i = 0; i < imgsPerThread; i++) {
4461 if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
4462 #pragma unroll
4463 for (int c = 0; c < numColors; c++) {
4464 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = images[imgStride * (c * imgPixels + y * imgSizeX + x) + i * B_X];
4465 }
4466 } else {
4467 #pragma unroll
4468 for (int c = 0; c < numColors; c++) {
4469 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4470 }
4471 }
4472 }
4473 } else { // Padding
4474 #pragma unroll
4475 for (int i = 0; i < imgsPerThread; i++) {
4476 #pragma unroll
4477 for (int c = 0; c < numColors; c++) {
4478 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4479 }
4480 }
4481 }
4482 }
4483 __syncthreads();
4484 #pragma unroll
4485 for (int i = 0; i < B_Y*numColors; i++) {
4486 #pragma unroll
4487 for(int f = 0; f < filtersPerThread; f++) {
4488 #pragma unroll
4489 for(int g = 0; g < imgsPerThread; g++) {
4490 prod[f][g] += shImages[i][g * B_X + threadIdx.x] * shFilters[i][threadIdx.y + f * B_Y];
4491 }
4492 }
4493
4494 }
4495 __syncthreads();
4496 }
4497
4498 if (scale) {
4499 #pragma unroll
4500 for (int g = 0; g < imgsPerThread; g++) {
4501 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4502 #pragma unroll
4503 for (int f = 0; f < filtersPerThread; f++) {
4504 targets[g * B_X + f * B_Y * numImages * numModulesY * numModulesX] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModulesY * numModulesX] + scaleOutputs * prod[f][g];
4505 }
4506 }
4507 }
4508 } else {
4509 #pragma unroll
4510 for (int g = 0; g < imgsPerThread; g++) {
4511 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4512 #pragma unroll
4513 for (int f = 0; f < filtersPerThread; f++) {
4514 targets[g * B_X + f * B_Y * numImages * numModulesY * numModulesX] = scaleOutputs * prod[f][g];
4515 }
4516 }
4517 }
4518 }
4519 }
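/*
 * Output indexing note (added commentary): targets is laid out as
 * (numFilters, numModulesY, numModulesX, numImages), so element
 * (filter f, module m, image i) lives at linear offset
 * (f * numModulesY * numModulesX + m) * numImages + i. The pointer arithmetic
 * near the top of the kernel bakes in m = moduleIdx plus the block's base
 * filter and image, leaving only the f * B_Y and g * B_X strides to the
 * write loops above.
 */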
4520
4521 /*
4522 * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images.
4523 * threadIdx.x determines image
4524 * threadIdx.y determines filter
4525 *
4526 * blockIdx.x determines image batch of B_X * imgsPerThread
4527 * blockIdx.y determines filter batch of B_Y * filtersPerThread
4528 *
4529 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
4530 * filters: (numFilterColors, filterPixels, numFilters) if conv
4531 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
4532 *
4533 * targets: (numFilters, numModulesY, numModulesX, numImages)
4534 *
4535 * B_Y one of 4, 8, 16
4536 * B_X one of 16, 32
4537 * imgsPerThread one of 1, 2, 4
4538 * filtersPerThread one of 1, 2, 4, 8
4539 * colorCache: how many colors to put into shmem
4540 *
4541 * numFilters should be divisible by B_Y * filtersPerThread
4542  * numImages should be divisible by B_X * imgsPerThread
4543 * numFilterColors should be divisible by colorCache.
4544 * numImgColors must be even.
4545 * numFilters must be divisible by numGroups.
4546 *
4547 * The imgSize here is the size of the actual image without the padding.
4548 *
4549 */
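/*
 * Group arithmetic example (added commentary): with numImgColors = 64 and
 * numGroups = 2, each filter sees numFilterColors = 64 / 2 = 32 input
 * channels. A block's filter batch determines its group (blockGroupIdx), and
 * therefore which contiguous 32-channel slice of the input it reads
 * (blockColorIdx = numFilterColors * blockGroupIdx below); colorCache channels
 * of that slice are staged through shared memory per outer-loop iteration.
 */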
4550 template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int colorCache,
4551 bool scale, bool checkImgBounds>
4552 __global__ void filterActs_YxX_sparse(float* images, float* filters, float* targets,
4553 const int numImages, const int numFilters,
4554 const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
4555 const int moduleStride,
4556 const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors,
4557 const int numGroups,
4558 const float scaleTargets, const float scaleOutputs,
4559 const bool conv) {
4560 __shared__ float shFilters[B_Y*colorCache][B_Y * filtersPerThread]; // pre-load B_Y pixels from B_Y*filtersPerThread filters
4561 __shared__ float shImages[B_Y*colorCache][B_X * imgsPerThread]; // pre-load B_Y pixels from B_X*imgsPerThread images
4562 const int imgPixels = imgSizeY * imgSizeX;
4563 const int filterPixels = filterSize * filterSize;
4564 const int numFilterColors = numImgColors / numGroups;
4565 const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
4566 const int moduleIdx = blockIdx.y / blocksPerModule;
4567 const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
4568 const int numFiltersPerGroup = numFilters / numGroups;
4569 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
4570
4571 const int numModules = numModulesX * numModulesY;
4572 const int blockColorIdx = numFilterColors * blockGroupIdx;
4573
4574 const int tidx = threadIdx.y * B_X + threadIdx.x;
4575
4576 const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride;
4577 const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride;
4578
4579 const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
4580 const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
4581 const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
4582
4583 images += blockColorIdx * imgPixels * imgStride + myImgIdx;
4584     filters += blockFilterIdx
4585 + shFilterLoadY * numFilters + shFilterLoadX;
4586 if (!conv) {
4587 filters += moduleIdx * numFilterColors * filterPixels * numFilters;
4588 }
4589
4590 targets += moduleIdx * numImages
4591 + (blockFilterIdx + threadIdx.y) * numImages * numModules
4592 + myImgIdx;
4593
4594 float prod[filtersPerThread][imgsPerThread];
4595 #pragma unroll
4596 for(int f = 0; f < filtersPerThread; f++) {
4597 #pragma unroll
4598 for(int g = 0; g < imgsPerThread; g++) {
4599 prod[f][g] = 0;
4600 }
4601 }
4602 // __shared__ int imgPos[]
4603 for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop)
4604 for (int p = 0; p < filterPixels; p += B_Y) {
4605 /*
4606 * Load B_Y pixels from B_Y*filtersPerThread filters
4607 */
4608 if (shFilterLoadY < B_Y) {
4609 #pragma unroll
4610 for (int p2 = 0; p2 < B_Y; p2 += B_X/filtersPerThread) {
4611 if (p + p2 + shFilterLoadY < filterPixels) {
4612 #pragma unroll
4613 for (int c = 0; c < colorCache; c++) {
4614 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = filters[((oc+c) * filterPixels + p + p2) * numFilters];
4615 }
4616 } else {
4617 #pragma unroll
4618 for (int c = 0; c < colorCache; c++) {
4619 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = 0;
4620 }
4621 }
4622 }
4623 }
4624
4625 /*
4626 * Load B_Y pixels from B_X*imgsPerThread images
4627 */
4628 const int pixIdx = p + threadIdx.y;
4629 if (pixIdx < filterPixels) {
4630 const int x = imgLoadModPosX + pixIdx % filterSize;
4631 const int y = imgLoadModPosY + pixIdx / filterSize;
4632 if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
4633 float* m = &images[imgStride * (oc * imgPixels + y * imgSizeX + x)];
4634 #pragma unroll
4635 for (int i = 0; i < imgsPerThread; i++) {
4636 if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
4637 #pragma unroll
4638 for (int c = 0; c < colorCache; c++) {
4639 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = m[c * imgStride * imgPixels + i * B_X];
4640 }
4641 } else {
4642 #pragma unroll
4643 for (int c = 0; c < colorCache; c++) {
4644 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4645 }
4646 }
4647 }
4648 } else { // Padding
4649 #pragma unroll
4650 for (int i = 0; i < imgsPerThread; i++) {
4651 #pragma unroll
4652 for (int c = 0; c < colorCache; c++) {
4653 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4654 }
4655 }
4656 }
4657 }
4658 __syncthreads();
4659 #pragma unroll
4660 for (int i = 0; i < B_Y*colorCache; i++) {
4661 #pragma unroll
4662 for(int f = 0; f < filtersPerThread; f++) {
4663 #pragma unroll
4664 for(int g = 0; g < imgsPerThread; g++) {
4665 prod[f][g] += shImages[i][g * B_X + threadIdx.x] * shFilters[i][threadIdx.y + f * B_Y];
4666 }
4667 }
4668
4669 }
4670 __syncthreads();
4671 }
4672 }
4673
4674 if (scale) {
4675 #pragma unroll
4676 for (int g = 0; g < imgsPerThread; g++) {
4677 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4678 #pragma unroll
4679 for (int f = 0; f < filtersPerThread; f++) {
4680 targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g];
4681 }
4682 }
4683 }
4684 } else {
4685 #pragma unroll
4686 for (int g = 0; g < imgsPerThread; g++) {
4687 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4688 #pragma unroll
4689 for (int f = 0; f < filtersPerThread; f++) {
4690 targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g];
4691 }
4692 }
4693 }
4694 }
4695 }
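/*
 * Tiling note for the kernel above (added commentary): it runs a two-level
 * loop, oc over channel tiles of width colorCache and p over filter-pixel
 * tiles of height B_Y. Each (oc, p) iteration stages a
 * (B_Y * colorCache) x (B_Y * filtersPerThread) filter tile and a matching
 * (B_Y * colorCache) x (B_X * imgsPerThread) image tile in shared memory,
 * then accumulates the filtersPerThread x imgsPerThread partial dot products
 * in registers (prod).
 */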
4696
4697
4698 /*
4699 * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images.
4700 * threadIdx.x determines image
4701 * threadIdx.y determines filter
4702 *
4703 * blockIdx.x determines image batch of B_X * imgsPerThread
4704 * blockIdx.y determines filter batch of B_Y * filtersPerThread
4705 *
4706 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
4707 * filters: (numFilterColors, filterPixels, numFilters) if conv
4708 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
4709 *
4710 * targets: (numFilters, numModulesY, numModulesX, numImages)
4711  * colorIndices: (numGroups, numFilterColors)
4712 *
4713 * B_Y one of 4, 8, 16
4714 * B_X one of 16, 32
4715 * imgsPerThread one of 1, 2, 4
4716 * filtersPerThread one of 1, 2, 4, 8
4717 * colorCache: how many colors to put into shmem
4718 *
4719 * numFilters should be divisible by B_Y * filtersPerThread
4720  * numImages should be divisible by B_X * imgsPerThread
4721 * numFilterColors should be divisible by colorCache.
4722 * numImgColors must be even.
4723 * numFilters must be divisible by numGroups.
4724 *
4725 * The imgSize here is the size of the actual image without the padding.
4726 */
4727 template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int colorCache, bool scale, bool checkImgBounds>
4728 __global__ void filterActs_YxX_sparse_random(float* images, float* filters, float* targets, int* colorIndices,
4729 const int numImages, const int numFilters,
4730 const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
4731 const int moduleStride,
4732 const int numModulesY, const int numModulesX, const int imgStride,
4733 /*const int numImgColors,*/ const int numFilterColors, const int numGroups,
4734 const float scaleTargets, const float scaleOutputs,
4735 const bool conv) {
4736 __shared__ float shFilters[B_Y*colorCache][B_Y * filtersPerThread]; // pre-load B_Y pixels from B_Y*filtersPerThread filters
4737 __shared__ float shImages[B_Y*colorCache][B_X * imgsPerThread]; // pre-load B_Y pixels from B_X*imgsPerThread images
4738 __shared__ int shColors[colorCache];
4739 const int imgPixels = imgSizeY * imgSizeX;
4740 const int filterPixels = filterSize * filterSize;
4741 // const int numFilterColors = numImgColors / numGroups;
4742 const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
4743 const int moduleIdx = blockIdx.y / blocksPerModule;
4744 const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
4745 const int numFiltersPerGroup = numFilters / numGroups;
4746 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
4747
4748 const int numModules = numModulesY * numModulesX;
4749
4750 const int tidx = threadIdx.y * B_X + threadIdx.x;
4751
4752 const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride;
4753 const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride;
4754
4755 const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
4756 const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
4757 const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
4758
4759 images += myImgIdx;
4760     filters += blockFilterIdx
4761 + shFilterLoadY * numFilters + shFilterLoadX;
4762 if (!conv) {
4763 filters += moduleIdx * numFilterColors * filterPixels * numFilters;
4764 }
4765
4766 targets += moduleIdx * numImages
4767 + (blockFilterIdx + threadIdx.y) * numImages * numModules
4768 + myImgIdx;
4769 colorIndices += blockGroupIdx * numFilterColors;
4770
4771 float prod[filtersPerThread][imgsPerThread];
4772 #pragma unroll
4773 for(int f = 0; f < filtersPerThread; f++) {
4774 #pragma unroll
4775 for(int g = 0; g < imgsPerThread; g++) {
4776 prod[f][g] = 0;
4777 }
4778 }
4779 // __shared__ int imgPos[]
4780 for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop)
4781
4782 // Kinda wasteful here but...shouldn't matter
4783 if (tidx < colorCache) {
4784 shColors[tidx] = colorIndices[oc + tidx] * imgStride * imgPixels;
4785 }
4786 __syncthreads();
4787 for (int p = 0; p < filterPixels; p += B_Y) {
4788 /*
4789 * Load B_Y pixels from B_Y*filtersPerThread filters
4790 */
4791 if (shFilterLoadY < B_Y) {
4792 #pragma unroll
4793 for (int p2 = 0; p2 < B_Y; p2 += B_X/filtersPerThread) {
4794 if (p + p2 + shFilterLoadY < filterPixels) {
4795 #pragma unroll
4796 for (int c = 0; c < colorCache; c++) {
4797 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = filters[((oc+c) * filterPixels + p + p2) * numFilters];
4798 }
4799 } else {
4800 #pragma unroll
4801 for (int c = 0; c < colorCache; c++) {
4802 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = 0;
4803 }
4804 }
4805 }
4806 }
4807
4808 /*
4809 * Load B_Y pixels from B_X*imgsPerThread images
4810 */
4811 const int pixIdx = p + threadIdx.y;
4812 if (pixIdx < filterPixels) {
4813 const int x = imgLoadModPosX + pixIdx % filterSize;
4814 const int y = imgLoadModPosY + pixIdx / filterSize;
4815 if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
4816 float* m = &images[imgStride * (y * imgSizeX + x)];
4817 #pragma unroll
4818 for (int i = 0; i < imgsPerThread; i++) {
4819 if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
4820 #pragma unroll
4821 for (int c = 0; c < colorCache; c++) {
4822 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = m[shColors[c] + i * B_X];
4823 }
4824 } else {
4825 #pragma unroll
4826 for (int c = 0; c < colorCache; c++) {
4827 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4828 }
4829 }
4830 }
4831 } else { // Padding
4832 #pragma unroll
4833 for (int i = 0; i < imgsPerThread; i++) {
4834 #pragma unroll
4835 for (int c = 0; c < colorCache; c++) {
4836 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4837 }
4838 }
4839 }
4840 }
4841 __syncthreads();
4842 #pragma unroll
4843 for (int i = 0; i < B_Y*colorCache; i++) {
4844 #pragma unroll
4845 for(int f = 0; f < filtersPerThread; f++) {
4846 #pragma unroll
4847 for(int g = 0; g < imgsPerThread; g++) {
4848 prod[f][g] += shImages[i][g * B_X + threadIdx.x] * shFilters[i][threadIdx.y + f * B_Y];
4849 }
4850 }
4851
4852 }
4853 __syncthreads();
4854 }
4855 }
4856
4857 if (scale) {
4858 #pragma unroll
4859 for (int g = 0; g < imgsPerThread; g++) {
4860 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4861 #pragma unroll
4862 for (int f = 0; f < filtersPerThread; f++) {
4863 targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g];
4864 }
4865 }
4866 }
4867 } else {
4868 #pragma unroll
4869 for (int g = 0; g < imgsPerThread; g++) {
4870 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4871 #pragma unroll
4872 for (int f = 0; f < filtersPerThread; f++) {
4873 targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g];
4874 }
4875 }
4876 }
4877 }
4878 }
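/*
 * Indirection note for the _random variant above (added commentary): instead
 * of a contiguous channel slice per group, colorIndices names numFilterColors
 * arbitrary input channels for each group. Every outer-color iteration loads
 * the next colorCache of those indices into shColors, pre-multiplied by
 * imgStride * imgPixels, so that shColors[c] is a ready-made base offset into
 * images; otherwise the kernel is identical to filterActs_YxX_sparse.
 */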
4879
4880 /*
4881 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
4882 * filters: (numFilterColors, filterPixels, numFilters) if conv
4883 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
4884 *
4885 * targets: (numFilters, numModules, numImages)
4886 *
4887 * Note: all of these convolution routines are optimized for the case when
4888 * the number of images (i.e. the minibatch size) is a multiple of 128.
4889  * Other batch sizes will work, but I made no attempt whatsoever
4890 * to make them work fast.
4891 */
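/*
 * Illustrative call (added commentary, not in the original file; sizes are
 * hypothetical): a 32x32 RGB batch convolved with 64 filters of size 5x5 at
 * stride 1 with 2 pixels of padding, so numModulesY = numModulesX = 32,
 * images is (3 * 1024, numImages) and filters is (3 * 25, 64):
 *
 *     _filterActs(images, filters, targets,
 *                 32,       // imgSizeY
 *                 32, 32,   // numModulesY, numModulesX
 *                 -2, 1,    // paddingStart (must be <= 0), moduleStride
 *                 3, 1,     // numImgColors, numGroups
 *                 0, 1,     // scaleTargets == 0 resizes targets; scaleOutput
 *                 true);    // conv: filters shared across modules
 */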
4892 void _filterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
4893 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
4894 int numImgColors, int numGroups,
4895 float scaleTargets, float scaleOutput, bool conv) {
4896 int numFilterColors = numImgColors / numGroups;
4897 int numFilters = filters.getNumCols();
4898 int numModules = numModulesY * numModulesX;
4899 int numImages = images.getNumCols();
4900 int imgPixels = images.getNumRows()/numImgColors;
4901 int imgSizeX = imgPixels / imgSizeY;
4902 int filterModuleMult = conv ? 1 : numModules;
4903
4904 assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0)));
4905 assert(numGroups == 1 || numFilterColors % 2 == 0);
4906 assert(numFilters % (16 * numGroups) == 0);
4907 assert(numImgColors % numGroups == 0);
4908 assert(images.getNumRows() == imgPixels * numImgColors);
4909 assert(imgSizeY * imgSizeX == imgPixels);
4910 int numFiltersPerGroup = numFilters / numGroups;
4911
4912 int imgStride = images.getStride(); // images does not need to be a contiguous matrix
4913
4914 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
4915 int filterSize = int(sqrt((double)filterPixels));
4916 assert(filterSize * filterSize == filterPixels);
4917 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
4918
4919 // These routines don't handle the case when only part of the image is visited in the convolution
4920 assert(paddingStart <= 0);
4921 assert(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
4922 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
4923 if (moduleStride > filterSize)
4924 {
4925 printf("moduleStride: %d\n", moduleStride);
4926 printf("filterSize: %d\n", filterSize);
4927 assert(false);
4928 }
4929
4930 assert(!images.isTrans());
4931 assert(!filters.isTrans());
4932 assert(!targets.isTrans());
4933
4934 assert(filters.isContiguous());
4935 assert(targets.isContiguous());
4936 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
4937 dim3 blocks = numFiltersPerGroup % 32 == 0 ? dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 8))
4938 : dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 4));
4939 dim3 threads(32, 4);
4940 bool checkImgBounds = numImages % (32*imgsPerThread) != 0;
4941 if (scaleTargets == 0) {
4942 targets.resize(numFilters * numModules, numImages);
4943 } else {
4944 assert(targets.getNumRows() == numFilters * numModules);
4945 assert(targets.getNumCols() == numImages);
4946 }
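    /*
     * Dispatch note (added commentary): everything below is a compile-time
     * dispatch table. imgsPerThread, filtersPerThread, numColors/colorCache,
     * scale and checkImgBounds are template parameters, so each runtime
     * configuration selects its own fully specialized kernel instantiation,
     * and each cudaFuncSetCacheConfig call must name exactly the
     * instantiation that is launched on the next line.
     */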
4947
4948 if (imgsPerThread == 4) {
4949 if (numImgColors <= 3) {
4950             assert(numGroups == 1); // It must be, given the asserts above, but just to be sure.
4951 if (scaleTargets == 0) { // don't scale
4952 if (numImgColors == 1) {
4953 if (checkImgBounds) {
4954 if (numFilters % 32 == 0) {
4955 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, false, true >, cudaFuncCachePreferShared);
4956 filterActs_YxX_color < 4, 32, 4, 8, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4957 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4958 } else {
4959 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, false, true >, cudaFuncCachePreferShared);
4960 filterActs_YxX_color < 4, 32, 4, 4, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4961 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4962 }
4963 } else {
4964 if (numFilters % 32 == 0) {
4965 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, false, false >, cudaFuncCachePreferShared);
4966 filterActs_YxX_color < 4, 32, 4, 8, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4967 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4968 } else {
4969 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, false, false >, cudaFuncCachePreferShared);
4970 filterActs_YxX_color < 4, 32, 4, 4, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4971 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4972 }
4973 }
4974 } else if (numImgColors == 2) {
4975 if (checkImgBounds) {
4976 if (numFilters % 32 == 0) {
4977 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, false, true >, cudaFuncCachePreferShared);
4978 filterActs_YxX_color < 4, 32, 4, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4979 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4980 } else {
4981 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, false, true >, cudaFuncCachePreferShared);
4982 filterActs_YxX_color < 4, 32, 4, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4983 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4984 }
4985 } else {
4986 if (numFilters % 32 == 0) {
4987 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, false, false >, cudaFuncCachePreferShared);
4988 filterActs_YxX_color < 4, 32, 4, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4989 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4990 } else {
4991 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, false, false >, cudaFuncCachePreferShared);
4992 filterActs_YxX_color < 4, 32, 4, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4993 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4994 }
4995 }
4996 } else if (numImgColors == 3) {
4997 if (checkImgBounds) {
4998 if (numFilters % 32 == 0) {
4999 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, false, true >, cudaFuncCachePreferShared);
5000 filterActs_YxX_color < 4, 32, 4, 8, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5001 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5002 } else {
5003 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, false, true >, cudaFuncCachePreferShared);
5004 filterActs_YxX_color < 4, 32, 4, 4, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5005 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5006 }
5007 } else {
5008 if (numFilters % 32 == 0) {
5009 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, false, false >, cudaFuncCachePreferShared);
5010 filterActs_YxX_color < 4, 32, 4, 8, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5011 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5012 } else {
5013 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, false, false >, cudaFuncCachePreferShared);
5014 filterActs_YxX_color < 4, 32, 4, 4, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5015 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5016 }
5017 }
5018 }
5019 } else { // do scale
5020 if (numImgColors == 1) {
5021 if (checkImgBounds) {
5022 if (numFilters % 32 == 0) {
5023 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, true, true >, cudaFuncCachePreferShared);
5024 filterActs_YxX_color < 4, 32, 4, 8, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5025 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5026 } else {
5027 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, true, true >, cudaFuncCachePreferShared);
5028 filterActs_YxX_color < 4, 32, 4, 4, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5029 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5030 }
5031 } else {
5032 if (numFilters % 32 == 0) {
5033 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, true, false >, cudaFuncCachePreferShared);
5034 filterActs_YxX_color < 4, 32, 4, 8, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5035 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5036 } else {
5037 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, true, false >, cudaFuncCachePreferShared);
5038 filterActs_YxX_color < 4, 32, 4, 4, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5039 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5040 }
5041 }
5042 } else if (numImgColors == 2) {
5043 if (checkImgBounds) {
5044 if (numFilters % 32 == 0) {
5045 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, true, true >, cudaFuncCachePreferShared);
5046 filterActs_YxX_color < 4, 32, 4, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5047 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5048 } else {
5049 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, true, true >, cudaFuncCachePreferShared);
5050 filterActs_YxX_color < 4, 32, 4, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5051 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5052 }
5053 } else {
5054 if (numFilters % 32 == 0) {
5055 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, true, false >, cudaFuncCachePreferShared);
5056 filterActs_YxX_color < 4, 32, 4, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5057 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5058 } else {
5059 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, true, false >, cudaFuncCachePreferShared);
5060 filterActs_YxX_color < 4, 32, 4, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5061 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5062 }
5063 }
5064 } else if (numImgColors == 3) {
5065 if (checkImgBounds) {
5066 if (numFilters % 32 == 0) {
5067 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, true, true >, cudaFuncCachePreferShared);
5068 filterActs_YxX_color < 4, 32, 4, 8, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5069 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5070 } else {
5071 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, true, true >, cudaFuncCachePreferShared);
5072 filterActs_YxX_color < 4, 32, 4, 4, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5073 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5074 }
5075 } else {
5076 if (numFilters % 32 == 0) {
5077 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, true, false >, cudaFuncCachePreferShared);
5078 filterActs_YxX_color < 4, 32, 4, 8, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5079 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5080 } else {
5081 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, true, false >, cudaFuncCachePreferShared);
5082 filterActs_YxX_color < 4, 32, 4, 4, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5083 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5084 }
5085 }
5086 }
5087 }
5088 } else {
5089 if (scaleTargets == 0) { // don't scale
5090 if (checkImgBounds) {
5091 if (numFiltersPerGroup % 32 == 0) {
5092 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, false, true >, cudaFuncCachePreferShared);
5093 filterActs_YxX_sparse < 4, 32, 4, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5094 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5095 } else {
5096 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, false, true >, cudaFuncCachePreferShared);
5097 filterActs_YxX_sparse < 4, 32, 4, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5098 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5099 }
5100 } else {
5101 if (numFiltersPerGroup % 32 == 0) {
5102 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, false, false >, cudaFuncCachePreferShared);
5103 filterActs_YxX_sparse < 4, 32, 4, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5104 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5105 } else {
5106 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, false, false >, cudaFuncCachePreferShared);
5107 filterActs_YxX_sparse < 4, 32, 4, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5108 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5109 }
5110 }
5111 } else { // do scale
5112 if (checkImgBounds) {
5113 if (numFiltersPerGroup % 32 == 0) {
5114                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, true, true >, cudaFuncCachePreferShared);
5115 filterActs_YxX_sparse < 4, 32, 4, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5116 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5117 } else {
5118                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, true, true >, cudaFuncCachePreferShared);
5119 filterActs_YxX_sparse < 4, 32, 4, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5120 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5121 }
5122 } else {
5123 if (numFiltersPerGroup % 32 == 0) {
5124                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, true, false >, cudaFuncCachePreferShared);
5125 filterActs_YxX_sparse < 4, 32, 4, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5126 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5127 } else {
5128                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, true, false >, cudaFuncCachePreferShared);
5129 filterActs_YxX_sparse < 4, 32, 4, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5130 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5131 }
5132 }
5133 }
5134 }
5135 } else if (imgsPerThread == 2) {
5136 if (numImgColors <= 3) {
5137             assert(numGroups == 1); // It must be, given the asserts above, but just to be sure.
5138 if (scaleTargets == 0) { // don't scale
5139 if (numImgColors == 1) {
5140 if (checkImgBounds) {
5141 if (numFilters % 32 == 0) {
5142 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, false, true >, cudaFuncCachePreferShared);
5143 filterActs_YxX_color < 4, 32, 2, 8, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5144 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5145 } else {
5146 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, false, true >, cudaFuncCachePreferShared);
5147 filterActs_YxX_color < 4, 32, 2, 4, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5148 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5149 }
5150 } else {
5151 if (numFilters % 32 == 0) {
5152 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, false, false >, cudaFuncCachePreferShared);
5153 filterActs_YxX_color < 4, 32, 2, 8, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5154 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5155 } else {
5156 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, false, false >, cudaFuncCachePreferShared);
5157 filterActs_YxX_color < 4, 32, 2, 4, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5158 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5159 }
5160 }
5161 } else if (numImgColors == 2) {
5162 if (checkImgBounds) {
5163 if (numFilters % 32 == 0) {
5164 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, false, true >, cudaFuncCachePreferShared);
5165 filterActs_YxX_color < 4, 32, 2, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5166 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5167 } else {
5168 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, false, true >, cudaFuncCachePreferShared);
5169 filterActs_YxX_color < 4, 32, 2, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5170 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5171 }
5172 } else {
5173 if (numFilters % 32 == 0) {
5174 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, false, false >, cudaFuncCachePreferShared);
5175 filterActs_YxX_color < 4, 32, 2, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5176 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5177 } else {
5178 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, false, false >, cudaFuncCachePreferShared);
5179 filterActs_YxX_color < 4, 32, 2, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5180 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5181 }
5182 }
5183 } else if (numImgColors == 3) {
5184 if (checkImgBounds) {
5185 if (numFilters % 32 == 0) {
5186 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, false, true >, cudaFuncCachePreferShared);
5187 filterActs_YxX_color < 4, 32, 2, 8, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5188 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5189 } else {
5190 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, false, true >, cudaFuncCachePreferShared);
5191 filterActs_YxX_color < 4, 32, 2, 4, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5192 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5193 }
5194 } else {
5195 if (numFilters % 32 == 0) {
5196 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, false, false >, cudaFuncCachePreferShared);
5197 filterActs_YxX_color < 4, 32, 2, 8, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5198 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5199 } else {
5200 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, false, false >, cudaFuncCachePreferShared);
5201 filterActs_YxX_color < 4, 32, 2, 4, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5202 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5203 }
5204 }
5205 }
5206 } else { // do scale
5207 if (numImgColors == 1) {
5208 if (checkImgBounds) {
5209 if (numFilters % 32 == 0) {
5210 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, true, true >, cudaFuncCachePreferShared);
5211 filterActs_YxX_color < 4, 32, 2, 8, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5212 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5213 } else {
5214 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, true, true >, cudaFuncCachePreferShared);
5215 filterActs_YxX_color < 4, 32, 2, 4, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5216 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5217 }
5218 } else {
5219 if (numFilters % 32 == 0) {
5220 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, true, false >, cudaFuncCachePreferShared);
5221 filterActs_YxX_color < 4, 32, 2, 8, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5222 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5223 } else {
5224 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, true, false >, cudaFuncCachePreferShared);
5225 filterActs_YxX_color < 4, 32, 2, 4, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5226 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5227 }
5228 }
5229 } else if (numImgColors == 2) {
5230 if (checkImgBounds) {
5231 if (numFilters % 32 == 0) {
5232 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, true, true >, cudaFuncCachePreferShared);
5233 filterActs_YxX_color < 4, 32, 2, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5234 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5235 } else {
5236 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, true, true >, cudaFuncCachePreferShared);
5237 filterActs_YxX_color < 4, 32, 2, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5238 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5239 }
5240 } else {
5241 if (numFilters % 32 == 0) {
5242 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, true, false >, cudaFuncCachePreferShared);
5243 filterActs_YxX_color < 4, 32, 2, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5244 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5245 } else {
5246 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, true, false >, cudaFuncCachePreferShared);
5247 filterActs_YxX_color < 4, 32, 2, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5248 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5249 }
5250 }
5251 } else if (numImgColors == 3) {
5252 if (checkImgBounds) {
5253 if (numFilters % 32 == 0) {
5254 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, true, true >, cudaFuncCachePreferShared);
5255 filterActs_YxX_color < 4, 32, 2, 8, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5256 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5257 } else {
5258 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, true, true >, cudaFuncCachePreferShared);
5259 filterActs_YxX_color < 4, 32, 2, 4, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5260 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5261 }
5262 } else {
5263 if (numFilters % 32 == 0) {
5264 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, true, false >, cudaFuncCachePreferShared);
5265 filterActs_YxX_color < 4, 32, 2, 8, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5266 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5267 } else {
5268 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, true, false >, cudaFuncCachePreferShared);
5269 filterActs_YxX_color < 4, 32, 2, 4, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5270 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5271 }
5272 }
5273 }
5274 }
5275 } else {
5276 if (scaleTargets == 0) { // don't scale
5277 if (checkImgBounds) {
5278 if (numFiltersPerGroup % 32 == 0) {
5279 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, false, true >, cudaFuncCachePreferShared);
5280 filterActs_YxX_sparse < 4, 32, 2, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5281 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5282 } else {
5283 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, false, true >, cudaFuncCachePreferShared);
5284 filterActs_YxX_sparse < 4, 32, 2, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5285 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5286 }
5287 } else {
5288 if (numFiltersPerGroup % 32 == 0) {
5289 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, false, false >, cudaFuncCachePreferShared);
5290 filterActs_YxX_sparse < 4, 32, 2, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5291 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5292 } else {
5293 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, false, false >, cudaFuncCachePreferShared);
5294 filterActs_YxX_sparse < 4, 32, 2, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5295 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5296 }
5297 }
5298 } else { // do scale
5299 if (checkImgBounds) {
5300 if (numFiltersPerGroup % 32 == 0) {
5301 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, true, true >, cudaFuncCachePreferShared);
5302 filterActs_YxX_sparse < 4, 32, 2, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5303 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5304 } else {
5305 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, true, true >, cudaFuncCachePreferShared);
5306 filterActs_YxX_sparse < 4, 32, 2, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5307 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5308 }
5309 } else {
5310 if (numFiltersPerGroup % 32 == 0) {
5311 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, true, false >, cudaFuncCachePreferShared);
5312 filterActs_YxX_sparse < 4, 32, 2, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5313 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5314 } else {
5315 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, true, false >, cudaFuncCachePreferShared);
5316 filterActs_YxX_sparse < 4, 32, 2, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5317 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5318 }
5319 }
5320 }
5321 }
5322 } else {
5323 if (numImgColors <= 3) {
5324 assert(numGroups == 1); // It has to be 1 based on the definitions above, but just to be sure.
5325 if (scaleTargets == 0) { // don't scale
5326 if (numImgColors == 1) {
5327 if (checkImgBounds) {
5328 if (numFilters % 32 == 0) {
5329 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, false, true >, cudaFuncCachePreferShared);
5330 filterActs_YxX_color < 4, 32, 1, 8, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5331 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5332 } else {
5333 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, false, true >, cudaFuncCachePreferShared);
5334 filterActs_YxX_color < 4, 32, 1, 4, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5335 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5336 }
5337 } else {
5338 if (numFilters % 32 == 0) {
5339 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, false, false >, cudaFuncCachePreferShared);
5340 filterActs_YxX_color < 4, 32, 1, 8, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5341 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5342 } else {
5343 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, false, false >, cudaFuncCachePreferShared);
5344 filterActs_YxX_color < 4, 32, 1, 4, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5345 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5346 }
5347 }
5348 } else if (numImgColors == 2) {
5349 if (checkImgBounds) {
5350 if (numFilters % 32 == 0) {
5351 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, false, true >, cudaFuncCachePreferShared);
5352 filterActs_YxX_color < 4, 32, 1, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5353 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5354 } else {
5355 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, false, true >, cudaFuncCachePreferShared);
5356 filterActs_YxX_color < 4, 32, 1, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5357 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5358 }
5359 } else {
5360 if (numFilters % 32 == 0) {
5361 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, false, false >, cudaFuncCachePreferShared);
5362 filterActs_YxX_color < 4, 32, 1, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5363 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5364 } else {
5365 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, false, false >, cudaFuncCachePreferShared);
5366 filterActs_YxX_color < 4, 32, 1, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5367 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5368 }
5369 }
5370 } else if (numImgColors == 3) {
5371 if (checkImgBounds) {
5372 if (numFilters % 32 == 0) {
5373 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, false, true >, cudaFuncCachePreferShared);
5374 filterActs_YxX_color < 4, 32, 1, 8, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5375 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5376 } else {
5377 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, false, true >, cudaFuncCachePreferShared);
5378 filterActs_YxX_color < 4, 32, 1, 4, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5379 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5380 }
5381 } else {
5382 if (numFilters % 32 == 0) {
5383 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, false, false >, cudaFuncCachePreferShared);
5384 filterActs_YxX_color < 4, 32, 1, 8, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5385 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5386 } else {
5387 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, false, false >, cudaFuncCachePreferShared);
5388 filterActs_YxX_color < 4, 32, 1, 4, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5389 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5390 }
5391 }
5392 }
5393 } else { // do scale
5394 if (numImgColors == 1) {
5395 if (checkImgBounds) {
5396 if (numFilters % 32 == 0) {
5397 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, true, true >, cudaFuncCachePreferShared);
5398 filterActs_YxX_color < 4, 32, 1, 8, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5399 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5400 } else {
5401 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, true, true >, cudaFuncCachePreferShared);
5402 filterActs_YxX_color < 4, 32, 1, 4, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5403 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5404 }
5405 } else {
5406 if (numFilters % 32 == 0) {
5407 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, true, false >, cudaFuncCachePreferShared);
5408 filterActs_YxX_color < 4, 32, 1, 8, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5409 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5410 } else {
5411 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, true, false >, cudaFuncCachePreferShared);
5412 filterActs_YxX_color < 4, 32, 1, 4, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5413 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5414 }
5415 }
5416 } else if (numImgColors == 2) {
5417 if (checkImgBounds) {
5418 if (numFilters % 32 == 0) {
5419 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, true, true >, cudaFuncCachePreferShared);
5420 filterActs_YxX_color < 4, 32, 1, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5421 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5422 } else {
5423 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, true, true >, cudaFuncCachePreferShared);
5424 filterActs_YxX_color < 4, 32, 1, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5425 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5426 }
5427 } else {
5428 if (numFilters % 32 == 0) {
5429 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, true, false >, cudaFuncCachePreferShared);
5430 filterActs_YxX_color < 4, 32, 1, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5431 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5432 } else {
5433 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, true, false >, cudaFuncCachePreferShared);
5434 filterActs_YxX_color < 4, 32, 1, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5435 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5436 }
5437 }
5438 } else if (numImgColors == 3) {
5439 if (checkImgBounds) {
5440 if (numFilters % 32 == 0) {
5441 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, true, true >, cudaFuncCachePreferShared);
5442 filterActs_YxX_color < 4, 32, 1, 8, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5443 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5444 } else {
5445 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, true, true >, cudaFuncCachePreferShared);
5446 filterActs_YxX_color < 4, 32, 1, 4, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5447 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5448 }
5449 } else {
5450 if (numFilters % 32 == 0) {
5451 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, true, false >, cudaFuncCachePreferShared);
5452 filterActs_YxX_color < 4, 32, 1, 8, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5453 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5454 } else {
5455 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, true, false >, cudaFuncCachePreferShared);
5456 filterActs_YxX_color < 4, 32, 1, 4, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5457 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5458 }
5459 }
5460 }
5461 }
5462 } else {
5463 if (scaleTargets == 0) { // don't scale
5464 if (checkImgBounds) {
5465 if (numFiltersPerGroup % 32 == 0) {
5466 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, false, true >, cudaFuncCachePreferShared);
5467 filterActs_YxX_sparse < 4, 32, 1, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5468 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5469 } else {
5470 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, false, true >, cudaFuncCachePreferShared);
5471 filterActs_YxX_sparse < 4, 32, 1, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5472 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5473 }
5474 } else {
5475 if (numFiltersPerGroup % 32 == 0) {
5476 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, false, false >, cudaFuncCachePreferShared);
5477 filterActs_YxX_sparse < 4, 32, 1, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5478 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5479 } else {
5480 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, false, false >, cudaFuncCachePreferShared);
5481 filterActs_YxX_sparse < 4, 32, 1, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5482 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5483 }
5484 }
5485 } else { // do scale
5486 if (checkImgBounds) {
5487 if (numFiltersPerGroup % 32 == 0) {
5488 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, true, true >, cudaFuncCachePreferShared);
5489 filterActs_YxX_sparse < 4, 32, 1, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5490 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5491 } else {
5492 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, true, true >, cudaFuncCachePreferShared);
5493 filterActs_YxX_sparse < 4, 32, 1, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5494 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5495 }
5496 } else {
5497 if (numFiltersPerGroup % 32 == 0) {
5498 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, true, false >, cudaFuncCachePreferShared);
5499 filterActs_YxX_sparse < 4, 32, 1, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5500 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5501 } else {
5502 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, true, false >, cudaFuncCachePreferShared);
5503 filterActs_YxX_sparse < 4, 32, 1, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5504 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5505 }
5506 }
5507 }
5508 }
5509 }
5510
5511 cutilCheckMsg("filterActs: kernel execution failed");
5512 }
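/*
 * A hedged refactoring sketch (not part of the original code): every branch of the
 * dispatch tree above repeats the same cudaFuncSetCacheConfig + launch pair, varying
 * only the template arguments. Folding the pair into one macro, as below, would shrink
 * the tree and make it impossible for the cache-config template arguments to drift out
 * of sync with the launched kernel. To actually be usable it would have to be defined
 * near the top of _filterActs, where blocks, threads and the kernel arguments are in
 * scope; a branch would then read LAUNCH_FILTER_ACTS_COLOR(2, 8, 3, false, true);
 */
#define LAUNCH_FILTER_ACTS_COLOR(IPT, FPT, C, SCALE, CHECK)                                \
    do {                                                                                   \
        cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, IPT, FPT, C, SCALE, CHECK >,   \
                               cudaFuncCachePreferShared);                                 \
        filterActs_YxX_color < 4, 32, IPT, FPT, C, SCALE, CHECK > <<<blocks, threads>>>(   \
            images.getDevData(), filters.getDevData(), targets.getDevData(),               \
            numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart,           \
            moduleStride, numModulesY, numModulesX, imgStride,                             \
            scaleTargets, scaleOutput, conv);                                              \
    } while (0)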
5513
5514 void convFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5515 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5516 int numImgColors, int numGroups) {
5517 convFilterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1);
5518 }
5519
5520 void convFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5521 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5522 int numImgColors, int numGroups,
5523 float scaleTargets, float scaleOutput) {
5524 _filterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true);
5525 }
5526
5527 void localFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5528 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5529 int numImgColors, int numGroups) {
5530 localFilterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1);
5531 }
5532
5533 void localFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5534 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5535 int numImgColors, int numGroups,
5536 float scaleTargets, float scaleOutput) {
5537 _filterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false);
5538 }
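/*
 * A minimal usage sketch for convFilterActs (illustrative only; the sizes below are
 * hypothetical and not taken from this file). For a 3-color 32x32 input and 64 filters
 * of size 5x5 with stride 1 and no padding, the module grid is 28x28: images is
 * (3*32*32, numImages), filters is (3*5*5, 64), and targets gets resized to
 * (64*28*28, numImages) because the no-scale overload passes scaleTargets == 0.
 */
static void exampleConvFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets) {
    // imgSizeY = 32, numModulesY = numModulesX = 28, paddingStart = 0,
    // moduleStride = 1, numImgColors = 3, numGroups = 1
    convFilterActs(images, filters, targets, 32, 28, 28, 0, 1, 3, 1);
}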
5539
5540 /*
5541 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
5542 * filters: (numFilterColors, filterPixels, numFilters) if conv
5543 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
5544 *
5545 * targets: (numFilters, numModulesY, numModulesX, numImages)
5546 * colorIndices: (numGroups, numFilterColors)
5547 *
5548 * Note: all of these convolution routines are optimized for the case when
5549 * the number of images (i.e. the minibatch size) is a multiple of 128.
5550 * Other batch sizes will work, but I made no attempt whatsoever
5551 * to make them work fast.
5552 */
5553 void _filterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5554 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5555 int numImgColors, int numFilterColors, int numGroups,
5556 float scaleTargets, float scaleOutput, bool conv) {
5557 int numFilters = filters.getNumCols();
5558 int numModules = numModulesY * numModulesX;
5559 int numImages = images.getNumCols();
5560 int imgPixels = images.getNumRows() / numImgColors;
5561 int imgSizeX = imgPixels / imgSizeY;
5562 int filterModuleMult = conv ? 1 : numModules;
5563
5564 assert(numGroups > 1);
5565 assert(numImgColors % numFilterColors == 0);
5566 assert((numFilterColors * numGroups) % numImgColors == 0);
5567 assert(numFilters % (16 * numGroups) == 0);
5568 assert(numFilterColors % 2 == 0);
5569
5570 assert(imgSizeY * imgSizeX == imgPixels);
5571 assert(images.getNumRows() == imgPixels * numImgColors);
5572 int numFiltersPerGroup = numFilters / numGroups;
5573
5574 int imgStride = images.getStride(); // images does not need to be a contiguous matrix
5575
5576 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
5577 int filterSize = int(sqrt((double)filterPixels));
5578 assert(filterSize * filterSize == filterPixels);
5579 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
5580
5581 // These routines don't handle the case when only part of the image is visited in the convolution
5582 assert(paddingStart <= 0);
5583 assert(paddingStart + (numModulesX-1) * moduleStride + filterSize >= imgSizeX);
5584 assert(paddingStart + (numModulesY-1) * moduleStride + filterSize >= imgSizeY);
5585 assert(moduleStride <= filterSize);
5586
5587 assert(!images.isTrans());
5588 assert(!filters.isTrans());
5589 assert(!targets.isTrans());
5590
5591 assert(filters.isContiguous());
5592 assert(targets.isContiguous());
5593 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
5594 dim3 blocks = numFiltersPerGroup % 32 == 0 ? dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 8))
5595 : dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 4));
5596 dim3 threads(32, 4);
5597 bool checkImgBounds = numImages % (32*imgsPerThread) != 0;
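    // Worked example (illustrative): numImages == 256 selects imgsPerThread == 4 with
    // checkImgBounds == false, since 256 is a multiple of 32*4; numImages == 100 falls
    // through to imgsPerThread == 1 with checkImgBounds == true (100 % 32 != 0), so the
    // slower bounds-checked kernel instantiations below are chosen.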
5598 if (scaleTargets == 0) {
5599 targets.resize(numFilters * numModules, numImages);
5600 } else {
5601 assert(targets.getNumRows() == numFilters * numModules);
5602 assert(targets.getNumCols() == numImages);
5603 }
5604
5605 if (imgsPerThread == 4) {
5606 if (scaleTargets == 0) { // don't scale
5607 if (checkImgBounds) {
5608 if (numFiltersPerGroup % 32 == 0) {
5609 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, false, true >, cudaFuncCachePreferShared);
5610 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5611 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5612 } else {
5613 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, false, true >, cudaFuncCachePreferShared);
5614 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5615 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5616 }
5617 } else {
5618 if (numFiltersPerGroup % 32 == 0) {
5619 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, false, false >, cudaFuncCachePreferShared);
5620 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5621 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5622 } else {
5623 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, false, false >, cudaFuncCachePreferShared);
5624 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5625 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5626 }
5627 }
5628 } else { // do scale
5629 if (checkImgBounds) {
5630 if (numFiltersPerGroup % 32 == 0) {
5631 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, true, true >, cudaFuncCachePreferShared);
5632 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5633 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5634 } else {
5635 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, true, true >, cudaFuncCachePreferShared);
5636 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5637 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5638 }
5639 } else {
5640 if (numFiltersPerGroup % 32 == 0) {
5641 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, true, false >, cudaFuncCachePreferShared);
5642 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5643 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5644 } else {
5645 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, true, false >, cudaFuncCachePreferShared);
5646 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5647 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5648 }
5649 }
5650 }
5651 } else if (imgsPerThread == 2) {
5652 if (scaleTargets == 0) { // don't scale
5653 if (checkImgBounds) {
5654 if (numFiltersPerGroup % 32 == 0) {
5655 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, false, true >, cudaFuncCachePreferShared);
5656 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5657 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5658 } else {
5659 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, false, true >, cudaFuncCachePreferShared);
5660 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5661 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5662 }
5663 } else {
5664 if (numFiltersPerGroup % 32 == 0) {
5665 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, false, false >, cudaFuncCachePreferShared);
5666 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5667 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5668 } else {
5669 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, false, false >, cudaFuncCachePreferShared);
5670 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5671 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5672 }
5673 }
5674 } else { // do scale
5675 if (checkImgBounds) {
5676 if (numFiltersPerGroup % 32 == 0) {
5677 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, true, true >, cudaFuncCachePreferShared);
5678 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5679 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5680 } else {
5681 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, true, true >, cudaFuncCachePreferShared);
5682 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5683 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5684 }
5685 } else {
5686 if (numFiltersPerGroup % 32 == 0) {
5687 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, true, false >, cudaFuncCachePreferShared);
5688 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5689 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5690 } else {
5691 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, true, false >, cudaFuncCachePreferShared);
5692 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5693 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5694 }
5695 }
5696 }
5697 } else {
5698 if (scaleTargets == 0) { // don't scale
5699 if (checkImgBounds) {
5700 if (numFiltersPerGroup % 32 == 0) {
5701 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, false, true >, cudaFuncCachePreferShared);
5702 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5703 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5704 } else {
5705 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, false, true >, cudaFuncCachePreferShared);
5706 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5707 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5708 }
5709 } else {
5710 if (numFiltersPerGroup % 32 == 0) {
5711 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, false, false >, cudaFuncCachePreferShared);
5712 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5713 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5714 } else {
5715 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, false, false >, cudaFuncCachePreferShared);
5716 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5717 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5718 }
5719 }
5720 } else { // do scale
5721 if (checkImgBounds) {
5722 if (numFiltersPerGroup % 32 == 0) {
5723 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, true, true >, cudaFuncCachePreferShared);
5724 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5725 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5726 } else {
5727 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, true, true >, cudaFuncCachePreferShared);
5728 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5729 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5730 }
5731 } else {
5732 if (numFiltersPerGroup % 32 == 0) {
5733 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, true, false >, cudaFuncCachePreferShared);
5734 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5735 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5736 } else {
5737 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, true, false >, cudaFuncCachePreferShared);
5738 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5739 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5740 }
5741 }
5742 }
5743 }
5744
5745 cutilCheckMsg("filterActsSparse: kernel execution failed");
5746 }
5747
5748 void convFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5749 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5750 int numImgColors, int numFilterColors, int numGroups,
5751 float scaleTargets, float scaleOutput) {
5752 _filterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride,
5753 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, true);
5754 }
5755
5756 void convFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5757 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5758 int numImgColors, int numFilterColors, int numGroups) {
5759 convFilterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart,
5760 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1);
5761 }
5762
5763 void localFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5764 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5765 int numImgColors, int numFilterColors, int numGroups,
5766 float scaleTargets, float scaleOutput) {
5767 _filterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride,
5768 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, false);
5769 }
5770
5771 void localFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5772 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5773 int numImgColors, int numFilterColors, int numGroups) {
5774 localFilterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart,
5775 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1);
5776 }
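/*
 * A minimal usage sketch for convFilterActsSparse (illustrative only; all sizes and the
 * index buffer are hypothetical). With numImgColors == 8, numGroups == 2 and
 * numFilterColors == 4, dColorIndices is a device array of numGroups * numFilterColors
 * == 8 ints selecting which image colors each group reads. For a 16x16 input, 4x4
 * filters, stride 2 and no padding, the module grid is 7x7: images is (8*16*16,
 * numImages), filters is (4*4*4, 32), and targets gets resized to (32*7*7, numImages).
 * These sizes satisfy the asserts in _filterActsSparse: numGroups > 1,
 * numFilters % (16 * numGroups) == 0, and numFilterColors % 2 == 0.
 */
static void exampleConvFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
                                        int* dColorIndices) {
    // imgSizeY = 16, numModulesY = numModulesX = 7, paddingStart = 0, moduleStride = 2,
    // numImgColors = 8, numFilterColors = 4, numGroups = 2
    convFilterActsSparse(images, filters, targets, dColorIndices, 16, 7, 7, 0, 2, 8, 4, 2);
}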
5777
5778 /*
5779 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
5780 * All rights reserved.
5781 *
5782 * Redistribution and use in source and binary forms, with or without modification,
5783 * are permitted provided that the following conditions are met:
5784 *
5785 * - Redistributions of source code must retain the above copyright notice,
5786 * this list of conditions and the following disclaimer.
5787 *
5788 * - Redistributions in binary form must reproduce the above copyright notice,
5789 * this list of conditions and the following disclaimer in the documentation
5790 * and/or other materials provided with the distribution.
5791 *
5792 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
5793 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
5794 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
5795 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
5796 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
5797 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
5798 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
5799 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
5800 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
5801 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5802 */
5803
5804 #ifndef _CUDACONV2_EXPORT
5805 #define _CUDACONV2_EXPORT
5806 #endif
5807
5808 #include <cudaconv2.cuh>
5809
5810 /*
5811 * Block size: 16x16.
5812 * blockIdx.x determines case in batches of 16*imgsPerThread.
5813 * blockIdx.y determines 4x4 image region in target image.
5814 *
5815 * threadIdx.x determines case.
5816 * threadIdx.y determines pixel.
5817 *
5818 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
5819 * filters: (numColors, filterPixels, numFilters) if conv
5820 * (numModulesY, numModulesX, numColors, filterPixels, numFilters) otherwise
5821 * targets: (numColors, imgSizeY, imgSizeX, numImages)
5822 *
5823 * Each block reconstructs one 4x4 pixel region from 16*imgsPerThread cases.
5824 *
5825 * Number of filters must be divisible by 16.
5826 * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is false.
5827 * 16 * imgsPerThread must be divisible by 32.
5828 *
5829 * This version loads 32 cases at a time, so it gets full coalescing on that load.
5830 * It only loads 16 weights at a time, so those aren't fully coalesced.
5831 * This version conserves shared memory by loading 16 filters at a time rather than 32.
5832 */
5833 template <int imgsPerThread, int numColors, bool scale, bool checkCaseBounds, bool conv>
5834 __global__ void img_acts_color(const float* hidActs, const float* filters, float* targets,
5835 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
5836 const int filterSize, const int imgSizeY, const int imgSizeX,
5837 const int paddingStart, const int moduleStride,
5838 const float scaleTargets, const float scaleOutputs) {
5839 __shared__ float shFilters[numColors*16][16 + 1];
5840 __shared__ float shHidActs[16][16*imgsPerThread];
5841
5842 const int blockCaseIdx = blockIdx.x * 16*imgsPerThread;
5843 const int numRegionsX = DIVUP(imgSizeX, 4);
5844 const int blockRegionIdx = blockIdx.y;
5845 const int blockRegionIdxX = blockRegionIdx % numRegionsX;
5846 const int blockRegionIdxY = blockRegionIdx / numRegionsX;
5847 const int blockRegionLeft = blockRegionIdxX * 4;
5848 const int blockRegionTop = blockRegionIdxY * 4;
5849 const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
5850 const int pxY = blockRegionTop + pxYInRegion;
5851 const int pxX = blockRegionLeft + pxXInRegion;
5852 const int pxIdx = pxY * imgSizeX + pxX;
5853 const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
5854 const int numModules = numModulesY * numModulesX;
5855 const int filterPixels = filterSize * filterSize;
5856 const int imgPixels = imgSizeX * imgSizeY;
5857 const int tidx = threadIdx.y * 16 + threadIdx.x;
5858 const int loadY = tidx / 32, loadX = tidx % 32;
5859
5860 hidActs += blockCaseIdx + loadY * numImages * numModules + loadX;
5861 filters += threadIdx.x;
5862 targets += pxIdx * numImages + blockCaseIdx + threadIdx.x;
5863
5864
5865 float prod[numColors][imgsPerThread];
5866 #pragma unroll
5867 for (int c = 0; c < numColors; c++) {
5868 #pragma unroll
5869 for (int i = 0; i < imgsPerThread; i++) {
5870 prod[c][i] = 0;
5871 }
5872 }
5873 const int startY = blockRegionTop - paddingStart < filterSize ? 0
5874 : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
5875 const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
5876 const int startX = blockRegionLeft - paddingStart < filterSize ? 0
5877 : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
5878 const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
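    // Worked example (illustrative): with paddingStart == 0, moduleStride == 1 and
    // filterSize == 5, a region with blockRegionTop == 8 covers image rows 8..11.
    // Module my covers rows my..my+4, so the modules that overlap the region are
    // 4 <= my <= 11: startY == 1 + (8 - 0 - 5) / 1 == 4 and endY == MIN(numModulesY, 12).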
5879
5880 float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
5881 float* shHidActLoad = &shHidActs[loadY][loadX];
5882
5883 for (int my = startY; my < endY; my++) {
5884 const int moduleTop = paddingStart + my * moduleStride;
5885 const int pxInModuleY = pxY - moduleTop;
5886
5887 for (int mx = startX; mx < endX; mx++) {
5888 const int moduleIdx = my * numModulesX + mx;
5889 const int moduleLeft = paddingStart + mx * moduleStride;
5890 const int pxInModuleX = pxX - moduleLeft;
5891
5892 const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
5893 const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
5894
5895 for (int f = 0; f < numFilters; f += 16) { // multiply with 16 filters at a time
5896 // Now the threads split up into half-warps, and each half-warp decides if it's interested.
5897 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
5898 #pragma unroll
5899 for (int i = 0; i < imgsPerThread * 16; i += 32) {
5900 if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) {
5901 #pragma unroll
5902 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
5903 shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
5904 }
5905 } else {
5906 #pragma unroll
5907 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
5908 shHidActLoad[j * 16 * imgsPerThread + i] = 0;
5909 }
5910 }
5911 }
5912
5913 if (isPxInImg && isPxInModule) {
5914 // This half-warp is interested, so it's going to load the weights from this module to its pixel.
5915 // Not fully coalesced read :(
5916 // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much.
5917 const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f]
5918 : &filters[(moduleIdx * numColors * filterPixels + pxIdxInModule) * numFilters + f];
5919 #pragma unroll
5920 for (int c = 0; c < numColors; c++) {
5921 shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters];
5922 }
5923
5924
5925 }
5926
5927 __syncthreads();
5928 // Do some actual computation
5929 if (isPxInImg && isPxInModule) {
5930 #pragma unroll
5931 for (int c = 0; c < numColors; c++) {
5932 #pragma unroll
5933 for (int w = 0; w < 16; w++) {
5934 #pragma unroll
5935 for (int i = 0; i < imgsPerThread; i++) {
5936 prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16];
5937 }
5938 }
5939 }
5940 }
5941 __syncthreads();
5942 }
5943 }
5944 }
5945 // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though
5946 if (isPxInImg) {
5947 if (scale) {
5948 #pragma unroll
5949 for (int i = 0; i < imgsPerThread; i++) {
5950 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
5951 #pragma unroll
5952 for (int c = 0; c < numColors; c++) {
5953 targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i];
5954 }
5955 }
5956 }
5957 } else {
5958 #pragma unroll
5959 for (int i = 0; i < imgsPerThread; i++) {
5960 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
5961 #pragma unroll
5962 for (int c = 0; c < numColors; c++) {
5963 targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i];
5964 }
5965 }
5966 }
5967 }
5968 }
5969 }
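/*
 * Grid-sizing sketch for img_acts_color (illustrative, assuming a host-side dispatch
 * that follows the mapping in the comment above the kernel): with imgSizeX == imgSizeY
 * == 7, numRegionsX == DIVUP(7, 4) == 2, so the image is tiled by 2x2 == 4 regions of
 * 4x4 pixels and gridDim.y == 4; pixels of the right and bottom regions that fall
 * outside the 7x7 image are masked by isPxInImg. With numImages == 128 and
 * imgsPerThread == 2, gridDim.x == 128 / (16*2) == 4, and each 16x16 block assigns
 * threadIdx.y to a pixel of its region and threadIdx.x to a case.
 */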
5970
5971 /*
5972 * Block size: 16x16.
5973 * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread.
5974 * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread)
5975 * blockIdx.x.y = 1..numImgColors/colorsPerThread
5976 * blockIdx.y determines 4x4 image region in target image.
5977 *
5978 * threadIdx.x determines case.
5979 * threadIdx.y determines pixel.
5980 *
5981 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
5982 * filters: (numFilterColors, filterPixels, numFilters) if conv
5983 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
5984 * targets: (numImageColors, imgSizeY, imgSizeX, numImages)
5985 *
5986 * Each block reconstructs one 4x4 pixel region from 16*imgsPerThread cases.
5987 *
5988 * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false.
5989 * 16 * imgsPerThread must be divisible by 32.
5990 * numImageColors/numGroups must be divisible by colorsPerThread.
5991 *
5992 * This version loads 32 cases at a time, so it gets full coalescing on that load.
5993 * It only loads 16 weights at a time, so those aren't fully coalesced.
5994 * This version conserves shared memory by loading 16 filters at a time rather than 32.
5995 *
5996 * To be used when there are 4-16 color channels.
5997 */
5998 template <int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
5999 __global__ void img_acts_mediumcolor(const float* hidActs, const float* filters, float* targets,
6000 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6001 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart,
6002 const int moduleStride, const int numImgColors, const int numGroups,
6003 const float scaleTargets, const float scaleOutputs) {
6004 __shared__ float shFilters[colorsPerThread*16][16 + 1];
6005 __shared__ float shHidActs[16][16*imgsPerThread];
6006
6007 const int numImgBlocks = DIVUP(numImages,16*imgsPerThread);
6008 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16*imgsPerThread;
6009
6010 const int imgColorIdx = (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally
6011 const int numFilterColors = numImgColors / numGroups;
6012 const int blockGroupIdx = imgColorIdx / numFilterColors;
6013 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6014 const int numFiltersPerGroup = numFilters / numGroups;
6015 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
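    // Worked example (illustrative): numImgColors == 8, numGroups == 2 and
    // colorsPerThread == 2 give numFilterColors == 4 and four color blocks per image
    // block; imgColorIdx takes the values 0, 2, 4, 6, so blockGroupIdx is 0, 0, 1, 1
    // and filterColorIdx (the color offset within the group) is 0, 2, 0, 2.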
6016
6017 const int numRegionsX = DIVUP(imgSizeX, 4);
6018 const int blockRegionIdx = blockIdx.y;
6019 const int blockRegionIdxX = blockRegionIdx % numRegionsX;
6020 const int blockRegionIdxY = blockRegionIdx / numRegionsX;
6021 const int blockRegionLeft = blockRegionIdxX * 4;
6022 const int blockRegionTop = blockRegionIdxY * 4;
6023 const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
6024 const int pxY = blockRegionTop + pxYInRegion;
6025 const int pxX = blockRegionLeft + pxXInRegion;
6026 const int pxIdx = pxY * imgSizeX + pxX;
6027 const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
6028 const uint numModules = numModulesY * numModulesX;
6029 const int filterPixels = filterSize * filterSize;
6030 const int imgPixels = imgSizeY * imgSizeX;
6031 const int tidx = threadIdx.y * 16 + threadIdx.x;
6032 const int loadY = tidx / 32, loadX = tidx % 32;
6033
6034 hidActs += blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX;
6035 filters += blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x;
6036 targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages + blockCaseIdx + threadIdx.x;
6037
6038 float prod[colorsPerThread][imgsPerThread];
6039 #pragma unroll
6040 for (int c = 0; c < colorsPerThread; c++) {
6041 #pragma unroll
6042 for (int i = 0; i < imgsPerThread; i++) {
6043 prod[c][i] = 0;
6044 }
6045 }
6046 const int startY = blockRegionTop - paddingStart < filterSize ? 0
6047 : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
6048 const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
6049 const int startX = blockRegionLeft - paddingStart < filterSize ? 0
6050 : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
6051 const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
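    /*
     * Derivation of the module range above (added annotation): module my covers
     * image rows [paddingStart + my*moduleStride, paddingStart + my*moduleStride + filterSize - 1].
     * The first module touching row blockRegionTop needs
     *     paddingStart + my*moduleStride + filterSize - 1 >= blockRegionTop,
     * i.e. my >= ceil((blockRegionTop - paddingStart - filterSize + 1) / moduleStride),
     * which for a positive numerator equals 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride.
     * The last module touching the region's bottom row blockRegionTop + 3 needs
     * paddingStart + my*moduleStride <= blockRegionTop + 3, giving the exclusive
     * bound endY (clamped to numModulesY). startX/endX are analogous in x.
     */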
6052
6053 float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
6054 float* shHidActLoad = &shHidActs[loadY][loadX];
6055
6056 for (int my = startY; my < endY; my++) {
6057 const int moduleTop = paddingStart + my * moduleStride;
6058 const int pxInModuleY = pxY - moduleTop;
6059
6060 for (int mx = startX; mx < endX; mx++) {
6061 const int moduleIdx = my * numModulesX + mx;
6062 const int moduleLeft = paddingStart + mx * moduleStride;
6063 const int pxInModuleX = pxX - moduleLeft;
6064
6065 const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
6066 const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
6067
6068                for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6069 // Now the threads split up into half-warps, and each half-warp decides if it's interested.
6070 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6071 #pragma unroll
6072 for (int i = 0; i < imgsPerThread * 16; i += 32) {
6073 if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) {
6074 #pragma unroll
6075 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6076 shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6077 }
6078 } else {
6079 #pragma unroll
6080 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6081 shHidActLoad[j * 16 * imgsPerThread + i] = 0;
6082 }
6083 }
6084 }
6085
6086 if (isPxInImg && isPxInModule) {
6087 // This half-warp is interested, so it's going to load the weights from this module to its pixel.
6088
6089 // Not fully coalesced read :(
6090 // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much.
6091 const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f]
6092 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInModule * numFilters + f];
6093 #pragma unroll
6094 for (int c = 0; c < colorsPerThread; c++) {
6095 shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters];
6096 }
6097 }
6098
6099 __syncthreads();
6100 // Do some actual computation
6101 if (isPxInImg && isPxInModule) {
6102 #pragma unroll
6103 for (int c = 0; c < colorsPerThread; c++) {
6104 #pragma unroll
6105 for (int w = 0; w < 16; w++) {
6106 #pragma unroll
6107 for (int i = 0; i < imgsPerThread; i++) {
6108 prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16];
6109 }
6110 }
6111 }
6112 }
6113 __syncthreads();
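                /* Note (added): the __syncthreads() before the multiply guarantees
                 * that the shHidActs/shFilters tiles are fully loaded before any
                 * thread reads them; the one just above keeps the next f-iteration
                 * from overwriting the tiles while they are still being consumed. */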
6114 }
6115 }
6116 }
6117 // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though
6118 if (isPxInImg) {
6119 if (scale) {
6120 #pragma unroll
6121 for (int i = 0; i < imgsPerThread; i++) {
6122 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6123 #pragma unroll
6124 for (int c = 0; c < colorsPerThread; c++) {
6125 targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i];
6126 }
6127 }
6128 }
6129 } else {
6130 #pragma unroll
6131 for (int i = 0; i < imgsPerThread; i++) {
6132 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6133 #pragma unroll
6134 for (int c = 0; c < colorsPerThread; c++) {
6135 targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i];
6136 }
6137 }
6138 }
6139 }
6140 }
6141 }
6142
6143 /*
6144 * Block size: B_YxB_X.
6145 * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread.
6146 * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread)
6147 * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread)
6148 * blockIdx.y determines image pixel in target image.
6149 *
6150 * threadIdx.x determines case.
6151 * threadIdx.y determines color.
6152 *
6153 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
6154 * filters: (numFilterColors, filterPixels, numFilters) if conv
6155 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
6156 * targets: (numImageColors, imgSizeY, imgSizeX, numImages)
6157 *
6158  * Each block reconstructs B_Y*colorsPerThread colors of one pixel for B_X*imgsPerThread cases.
6159 *
6160 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
6161 * numFiltersPerGroup must be divisible by 16.
6162 *
6163 * B_X * imgsPerThread must be divisible by 32.
6164 * numFilterColors must be divisible by B_Y*colorsPerThread.
6165 * B_X*B_Y must be divisible by 32.
6166 *
6167 * This version loads 32 cases at a time, so it gets full coalescing on that load.
6168 * It only loads 16 weights at a time, so those aren't fully coalesced.
6169 * This version conserves shared memory by loading 16 filters at a time rather than 32.
6170 *
6171 * To be used when there are >= 16 color channels.
6172 */
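/*
 * Illustrative example (added annotation, sizes hypothetical): with B_Y = 4,
 * B_X = 32, imgsPerThread = 4, colorsPerThread = 4, numImages = 256,
 * numImgColors = 32, and a 32x32 target image:
 *     gridDim.x = (256 / (32*4)) * (32 / (4*4)) = 4  (2 case blocks x 2 color blocks)
 *     gridDim.y = 32 * 32 = 1024                     (one block column per pixel)
 * matching the dim3 computed for this kernel in _imgActs below.
 */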
6173 template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
6174 __global__ void conv_img_acts_manycolor(const float* hidActs, const float* filters, float* targets,
6175 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6176 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
6177 const int numImgColors, const int numGroups,
6178 const float scaleTargets, const float scaleOutputs) {
6179 __shared__ float shFilters[colorsPerThread*B_Y][16 + 1]; // TODO: perhaps reconsider this 16
6180 __shared__ float shHidActs[16][B_X*imgsPerThread];
6181
6182 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
6183 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;
6184
6185 const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally
6186 const int numFilterColors = numImgColors / numGroups;
6187 const int blockGroupIdx = imgColorIdx / numFilterColors;
6188 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6189 const int numFiltersPerGroup = numFilters / numGroups;
6190 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
6191
6192 const int blockPixelIdx = blockIdx.y;
6193 const int blockPixelIdxX = blockPixelIdx % imgSizeX;
6194 const int blockPixelIdxY = blockPixelIdx / imgSizeX;
6195
6196 const int filterPixels = filterSize * filterSize;
6197 const int imgPixels = imgSizeY * imgSizeX;
6198 const int tidx = threadIdx.y * B_X + threadIdx.x;
6199 const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32;
6200 const int filtersLoadY = tidx / 16, filtersLoadX = tidx % 16;
6201 const int numModules = numModulesY * numModulesX;
6202
6203 hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
6204 filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX;
6205 targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;
6206
6207 float prod[colorsPerThread][imgsPerThread];
6208 #pragma unroll
6209 for (int c = 0; c < colorsPerThread; c++) {
6210 #pragma unroll
6211 for (int i = 0; i < imgsPerThread; i++) {
6212 prod[c][i] = 0;
6213 }
6214 }
6215
6216 const int startY = blockPixelIdxY - paddingStart < filterSize ? 0
6217 : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
6218 const int endY = MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
6219 const int startX = blockPixelIdxX - paddingStart < filterSize ? 0
6220 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
6221 const int endX = MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
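    /* Note (added): same module-range derivation as in img_acts_mediumcolor,
     * with the single pixel (blockPixelIdxY, blockPixelIdxX) playing the role of
     * both the region's top-left and bottom-right corners. */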
6222
6223 float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
6224 float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX];
6225
6226 for (int my = startY; my < endY; my++) {
6227 const int moduleTop = paddingStart + my * moduleStride;
6228 const int pxInFilterY = blockPixelIdxY - moduleTop;
6229
6230 for (int mx = startX; mx < endX; mx++) {
6231 const int moduleIdx = my * numModulesX + mx;
6232 const int moduleLeft = paddingStart + mx * moduleStride;
6233 const int pxInFilterX = blockPixelIdxX - moduleLeft;
6234
6235 const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
6236
6237 for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6238 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6239 #pragma unroll
6240 for (int i = 0; i < imgsPerThread * B_X; i += 32) {
6241 if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) {
6242 #pragma unroll
6243                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6244 shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6245 }
6246 } else {
6247 #pragma unroll
6248                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6249 shHidActLoad[j * B_X * imgsPerThread + i] = 0;
6250 }
6251 }
6252 }
6253 const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f]
6254 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f];
6255 #pragma unroll
6256 for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/16) {
6257 if ((colorsPerThread*B_Y) % (B_X*B_Y/16) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
6258 shFilterLoad[i * (16 + 1)] = fLoad[i * filterPixels * numFilters];
6259 }
6260 }
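                /* Note (added): the divisibility test above involves only template
                 * parameters, so it is folded at compile time; when it holds, the
                 * per-iteration bounds check disappears from the unrolled loop. */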
6261
6262 __syncthreads();
6263 // Do some actual computation
6264 #pragma unroll
6265 for (int c = 0; c < colorsPerThread; c++) {
6266 #pragma unroll
6267 for (int w = 0; w < 16; w++) {
6268 #pragma unroll
6269 for (int i = 0; i < imgsPerThread; i++) {
6270 prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * shHidActs[w][threadIdx.x + i * B_X];
6271 }
6272 }
6273 }
6274 __syncthreads();
6275 }
6276 }
6277 }
6278 if (scale) {
6279 #pragma unroll
6280 for (int i = 0; i < imgsPerThread; i++) {
6281 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6282 #pragma unroll
6283 for (int c = 0; c < colorsPerThread; c++) {
6284 targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i];
6285 }
6286 }
6287 }
6288 } else {
6289 #pragma unroll
6290 for (int i = 0; i < imgsPerThread; i++) {
6291 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6292 #pragma unroll
6293 for (int c = 0; c < colorsPerThread; c++) {
6294 targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i];
6295 }
6296 }
6297 }
6298 }
6299 }
6300
6301
6302 /*
6303 * Block size: 16x16.
6304 * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread.
6305 * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread)
6306 * blockIdx.x.y = 1..numImgColors/colorsPerThread
6307 * blockIdx.y determines 4x4 image region in target image, also sample
6308 * In essence, blockIdx.y.x = 1..numRegions
6309 * blockIdx.y.y = 1..overSample
6310 *
6311 * threadIdx.x determines case.
6312 * threadIdx.y determines pixel.
6313 *
6314 * overSample := numFilterColors*numGroups/numImgColors
6315 * ^ this is the number of groups that each color channel is connected to
6316 *
6317 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
6318 * filters: (numFilterColors, filterPixels, numFilters) if conv
6319 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
6320 * targets: (overSample, numImgColors, imgSizeY, imgSizeX, numImages)
6321 *
6322 * colorIndices: (numGroups, numFilterColors)
6323 *
6324  * Each block reconstructs one 4x4 pixel region from 16*imgsPerThread cases.
6325 *
6326 * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false.
6327 * 16 * imgsPerThread must be divisible by 32.
6328 * numFilterColors must be divisible by colorsPerThread.
6329 *
6330 * This version loads 32 cases at a time, so it gets full coalescing on that load.
6331 * It only loads 16 weights at a time, so those aren't fully coalesced.
6332 * This version conserves shared memory by loading 16 filters at a time rather than 32.
6333 *
6334 * To be used when there are 4-16 color channels.
6335 */
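/*
 * Illustrative example (added annotation, sizes hypothetical): with
 * numImgColors = 8, numFilterColors = 4, and numGroups = 4,
 *     overSample = 4*4/8 = 2 and groupsPerSample = 4/2 = 2,
 * i.e. each image channel feeds two groups, gridDim.y = overSample * numRegions,
 * and blockSample selects which of the two per-channel reconstructions this
 * block writes into the leading overSample dimension of targets.
 */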
6336 template <int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
6337 __global__ void img_acts_mediumcolor_sparse_rand(const float* hidActs, const float* filters, float* targets, int* colorIndices,
6338 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6339 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
6340 const int numImgColors, const int numFilterColors, const int numGroups,
6341 const float scaleTargets, const float scaleOutputs) {
6342 __shared__ float shFilters[colorsPerThread*16][16 + 1];
6343 __shared__ float shHidActs[16][16*imgsPerThread];
6344 __shared__ int shColors[colorsPerThread]; // not really necessary -- can repurpose the other shmems
6345
6346 const int numImgBlocks = DIVUP(numImages,16*imgsPerThread);
6347 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16*imgsPerThread;
6348
6349 const int numRegionsX = DIVUP(imgSizeX, 4);
6350 const int numRegions = numRegionsX * numRegionsX;
6351 const int imgColorIdx = (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally
6352
6353 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6354 const int numFiltersPerGroup = numFilters / numGroups;
6355
6356 const int overSample = gridDim.y / numRegions;
6357 const int blockSample = blockIdx.y / numRegions;
6358 const int groupsPerSample = numGroups / overSample;
6359 const int blockGroupIdx = imgColorIdx / numFilterColors + blockSample * groupsPerSample;
6360 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
6361
6362 const int blockRegionIdx = blockIdx.y % numRegions;
6363 const int blockRegionIdxX = blockRegionIdx % numRegionsX;
6364 const int blockRegionIdxY = blockRegionIdx / numRegionsX;
6365 const int blockRegionLeft = blockRegionIdxX * 4;
6366 const int blockRegionTop = blockRegionIdxY * 4;
6367 const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
6368 const int pxY = blockRegionTop + pxYInRegion;
6369 const int pxX = blockRegionLeft + pxXInRegion;
6370 const int pxIdx = pxY * imgSizeX + pxX;
6371 const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
6372 const uint numModules = numModulesY * numModulesX;
6373 const int filterPixels = filterSize * filterSize;
6374 const int imgPixels = imgSizeY * imgSizeX;
6375 const int tidx = threadIdx.y * 16 + threadIdx.x;
6376 const int loadY = tidx / 32, loadX = tidx % 32;
6377
6378 hidActs += blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX;
6379 filters += blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x;
6380 targets += blockSample * numImgColors * imgPixels * numImages + pxIdx * numImages + blockCaseIdx + threadIdx.x;
6381
6382 float prod[colorsPerThread][imgsPerThread];
6383 #pragma unroll
6384 for (int c = 0; c < colorsPerThread; c++) {
6385 #pragma unroll
6386 for (int i = 0; i < imgsPerThread; i++) {
6387 prod[c][i] = 0;
6388 }
6389 }
6390 const int startY = blockRegionTop - paddingStart < filterSize ? 0
6391 : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
6392 const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
6393 const int startX = blockRegionLeft - paddingStart < filterSize ? 0
6394 : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
6395 const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
6396
6397 float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
6398 float* shHidActLoad = &shHidActs[loadY][loadX];
6399
6400 if (tidx < colorsPerThread) {
6401 shColors[tidx] = colorIndices[blockGroupIdx * numFilterColors + filterColorIdx + tidx] * imgPixels * numImages;
6402 }
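    /* Note (added): only the first colorsPerThread threads populate shColors; each
     * entry caches the precomputed targets offset of one gathered image channel,
     * colorIndices[...] * imgPixels * numImages, reused by every write at the end. */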
6403
6404 for (int my = startY; my < endY; my++) {
6405 const int moduleTop = paddingStart + my * moduleStride;
6406 const int pxInModuleY = pxY - moduleTop;
6407
6408 for (int mx = startX; mx < endX; mx++) {
6409 const int moduleIdx = my * numModulesX + mx;
6410 const int moduleLeft = paddingStart + mx * moduleStride;
6411 const int pxInModuleX = pxX - moduleLeft;
6412
6413 const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
6414 const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
6415
6416             for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6417 // Now the threads split up into half-warps, and each half-warp decides if it's interested.
6418 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6419 #pragma unroll
6420 for (int i = 0; i < imgsPerThread * 16; i += 32) {
6421 if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) {
6422 #pragma unroll
6423 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6424 shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6425 }
6426 } else {
6427 #pragma unroll
6428 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6429 shHidActLoad[j * 16 * imgsPerThread + i] = 0;
6430 }
6431 }
6432 }
6433
6434 if (isPxInImg && isPxInModule) {
6435 // This half-warp is interested, so it's going to load the weights from this module to its pixel.
6436
6437 // Not fully coalesced read :(
6438 // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much.
6439 const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f]
6440 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInModule * numFilters + f];
6441 #pragma unroll
6442 for (int c = 0; c < colorsPerThread; c++) {
6443 shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters];
6444 }
6445 }
6446
6447 __syncthreads();
6448 // Do some actual computation
6449 if (isPxInImg && isPxInModule) {
6450 #pragma unroll
6451 for (int c = 0; c < colorsPerThread; c++) {
6452 #pragma unroll
6453 for (int w = 0; w < 16; w++) {
6454 #pragma unroll
6455 for (int i = 0; i < imgsPerThread; i++) {
6456 prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16];
6457 }
6458 }
6459 }
6460 }
6461 __syncthreads();
6462 }
6463 }
6464 }
6465 // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though
6466 if (isPxInImg) {
6467 if (scale) {
6468 #pragma unroll
6469 for (int i = 0; i < imgsPerThread; i++) {
6470 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6471 #pragma unroll
6472 for (int c = 0; c < colorsPerThread; c++) {
6473 targets[shColors[c] + i * 16] = scaleTargets * targets[shColors[c] + i * 16] + scaleOutputs * prod[c][i];
6474 }
6475 }
6476 }
6477 } else {
6478 #pragma unroll
6479 for (int i = 0; i < imgsPerThread; i++) {
6480 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6481 #pragma unroll
6482 for (int c = 0; c < colorsPerThread; c++) {
6483 targets[shColors[c] + i * 16] = scaleOutputs * prod[c][i];
6484 }
6485 }
6486 }
6487 }
6488 }
6489 }
6490
6491 /*
6492 * Block size: B_YxB_X.
6493 * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread.
6494 * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread)
6495 * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread)
6496 * blockIdx.y determines image pixel in target image, sample idx.
6497 * In essence, blockIdx.y.x = 1..imgPixels
6498 * blockIdx.y.y = 1..overSample
6499 *
6500 * threadIdx.x determines case.
6501 * threadIdx.y determines color.
6502 *
6503 * overSample := numFilterColors*numGroups/numImgColors
6504 * ^ this is the number of groups that each color channel is connected to
6505 *
6506 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
6507 * filters: (numFilterColors, filterPixels, numFilters) if conv
6508 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
6509 * targets: (overSample, numImgColors, imgSizeY, imgSizeX, numImages)
6510 *
6511 * colorIndices: (numGroups, numFilterColors)
6512 *
6513  * Each block reconstructs B_Y*colorsPerThread colors of one pixel for B_X*imgsPerThread cases.
6514 *
6515 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
6516 * numFiltersPerGroup must be divisible by 16.
6517 * numFilterColors*numGroups must be divisible by numImgColors.
6518 *
6519 * B_X * imgsPerThread must be divisible by 32.
6520 * numFilterColors must be divisible by B_Y*colorsPerThread.
6521 * B_X*B_Y must be divisible by 32.
6522 *
6523 * This version loads 32 cases at a time, so it gets full coalescing on that load.
6524 * It only loads 16 weights at a time, so those aren't fully coalesced.
6525 * This version conserves shared memory by loading 16 filters at a time rather than 32.
6526 *
6527 * To be used when there are >= 16 color channels.
6528 */
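/*
 * Example (added annotation, indices hypothetical): if this block's group has
 * colorIndices row {5, 2, 7, 0} and numFilterColors = 4, then filter color c is
 * written to image channel colorIndices[c]; shColors in the kernel body caches
 * the corresponding offsets colorIndices[c] * imgPixels * numImages.
 */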
6529 template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
6530 __global__ void img_acts_manycolor_sparse_rand(const float* hidActs, const float* filters, float* targets, int* colorIndices,
6531 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6532 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
6533 const int numImgColors, const int numFilterColors, const int numGroups,
6534 const float scaleTargets, const float scaleOutputs) {
6535 __shared__ float shFilters[colorsPerThread*B_Y][16 + 1]; // TODO: perhaps reconsider this 16
6536 __shared__ float shHidActs[16][B_X*imgsPerThread];
6537 __shared__ int shColors[colorsPerThread * B_Y]; // not really necessary -- can repurpose the other shmems
6538
6539 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
6540 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;
6541
6542 const int filterPixels = filterSize * filterSize;
6543 const int imgPixels = imgSizeY * imgSizeX;
6544 const int tidx = threadIdx.y * B_X + threadIdx.x;
6545 const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32;
6546 const int filtersLoadY = tidx / 16, filtersLoadX = tidx % 16;
6547 const int numModules = numModulesY * numModulesX;
6548
6549 const int overSample = gridDim.y / imgPixels;
6550 const int blockSample = blockIdx.y / imgPixels;
6551 const int groupsPerSample = numGroups / overSample;
6552
6553 // const int overSample = (numFilterColors * numGroups) / numImgColors;
6554 const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
6555 const int blockGroupIdx = imgColorIdx / numFilterColors + blockSample * groupsPerSample;
6556 // const int filterColorsPerSample = numFilterColors / overSample;
6557
6558 const int blockPixelIdx = blockIdx.y % imgPixels;
6559 const int blockPixelIdxX = blockPixelIdx % imgSizeX;
6560 const int blockPixelIdxY = blockPixelIdx / imgSizeX;
6561
6562 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6563 const int numFiltersPerGroup = numFilters / numGroups;
6564 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
6565
6566 hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
6567 filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX;
6568 targets += blockSample * numImgColors * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;
6569
6570 float prod[colorsPerThread][imgsPerThread];
6571 #pragma unroll
6572 for (int c = 0; c < colorsPerThread; c++) {
6573 #pragma unroll
6574 for (int i = 0; i < imgsPerThread; i++) {
6575 prod[c][i] = 0;
6576 }
6577 }
6578
6579 const int startY = blockPixelIdxY - paddingStart < filterSize ? 0
6580 : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
6581 const int endY = MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
6582 const int startX = blockPixelIdxX - paddingStart < filterSize ? 0
6583 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
6584 const int endX = MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
6585
6586 float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
6587 float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX];
6588
6589 if (tidx < colorsPerThread * B_Y) {
6590 shColors[tidx] = colorIndices[blockGroupIdx * numFilterColors + filterColorIdx + tidx] * imgPixels * numImages;
6591 }
6592
6593 for (int my = startY; my < endY; my++) {
6594 const int moduleTop = paddingStart + my * moduleStride;
6595 const int pxInFilterY = blockPixelIdxY - moduleTop;
6596
6597 for (int mx = startX; mx < endX; mx++) {
6598 const int moduleIdx = my * numModulesX + mx;
6599 const int moduleLeft = paddingStart + mx * moduleStride;
6600 const int pxInFilterX = blockPixelIdxX - moduleLeft;
6601
6602 const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
6603
6604 for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6605 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6606 #pragma unroll
6607 for (int i = 0; i < imgsPerThread * B_X; i += 32) {
6608 if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) {
6609 #pragma unroll
6610                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6611 shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6612 }
6613 } else {
6614 #pragma unroll
6615                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6616 shHidActLoad[j * B_X * imgsPerThread + i] = 0;
6617 }
6618 }
6619 }
6620
6621 const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f]
6622 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f];
6623 #pragma unroll
6624 for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/16) {
6625 if ((colorsPerThread*B_Y) % (B_X*B_Y/16) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
6626 shFilterLoad[i * (16 + 1)] = fLoad[i * filterPixels * numFilters];
6627 }
6628 }
6629
6630 __syncthreads();
6631 // Do some actual computation
6632 #pragma unroll
6633 for (int c = 0; c < colorsPerThread; c++) {
6634 #pragma unroll
6635 for (int w = 0; w < 16; w++) {
6636 #pragma unroll
6637 for (int i = 0; i < imgsPerThread; i++) {
6638 prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * shHidActs[w][threadIdx.x + i * B_X];
6639 }
6640 }
6641 }
6642 __syncthreads();
6643 }
6644 }
6645 }
6646
6647 if (scale) {
6648 #pragma unroll
6649 for (int i = 0; i < imgsPerThread; i++) {
6650 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6651 #pragma unroll
6652 for (int c = 0; c < colorsPerThread; c++) {
6653 targets[shColors[c * B_Y + threadIdx.y] + i * B_X] = scaleTargets * targets[shColors[c * B_Y + threadIdx.y] + i * B_X] + scaleOutputs * prod[c][i];
6654 }
6655 }
6656 }
6657 } else {
6658 #pragma unroll
6659 for (int i = 0; i < imgsPerThread; i++) {
6660 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6661 #pragma unroll
6662 for (int c = 0; c < colorsPerThread; c++) {
6663 targets[shColors[c * B_Y + threadIdx.y] + i * B_X] = scaleOutputs * prod[c][i];
6664 }
6665 }
6666 }
6667 }
6668 }
6669
6670 /*
6671 * hidActs: (numFilters, numModules, numImages)
6672 * filters: (numFilterColors, filterPixels, numFilters) if conv
6673 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
6674  * targets:     (numImgColors, imgPixels, numImages)
6675 *
6676 * Note: all of these convolution routines are optimized for the case when
6677 * the number of images (i.e. the minibatch size) is a multiple of 128.
6678  * Other batch sizes will work, but I made no attempt whatsoever
6679 * to make them work fast.
6680 */
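/*
 * Usage sketch (added annotation; sizes hypothetical, not from the original
 * source): reconstruct 128 RGB 32x32 images from the activations of a conv
 * layer with 64 5x5 filters, stride 1, zero padding:
 *
 *     NVMatrix hidActs;   // (64 * 28*28) x 128, laid out as documented above
 *     NVMatrix filters;   // (3 * 5*5) x 64
 *     NVMatrix targets;   // resized inside to (3 * 32*32) x 128
 *     _imgActs(hidActs, filters, targets,
 *              32, 32,    // imgSizeY, imgSizeX
 *              28,        // numModulesY (numModulesX = numModules / numModulesY)
 *              0, 1,      // paddingStart (must be <= 0), moduleStride
 *              3, 1,      // numImgColors, numGroups
 *              0, 1,      // scaleTargets = 0 -> overwrite targets; scaleOutput
 *              true);     // conv: filters shared across modules
 */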
6681 void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
6682 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
6683 float scaleTargets, float scaleOutput, bool conv) {
6684 int numFilterColors = numImgColors / numGroups;
6685 int numImages = hidActs.getNumCols();
6686 int numFilters = filters.getNumCols();
6687 int numModules = hidActs.getNumRows() / numFilters;
6688 int filterModuleMult = conv ? 1 : numModules;
6689 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
6690 int filterSize = sqrt((double)filterPixels);
6691 int imgPixels = imgSizeY * imgSizeX;
6692 int numModulesX = numModules / numModulesY;
6693
6694 assert(numImgColors % numGroups == 0);
6695 assert(numFilters % (16*numGroups) == 0);
6696 assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0)));
6697 assert(numGroups == 1 || numFilterColors % 4 == 0);
6698
6699 assert(filterPixels == filterSize * filterSize);
6700 assert(hidActs.getNumRows() == numModules * numFilters);
6701 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
6702 assert(numModules == numModulesY * numModulesX);
6703
6704 assert(hidActs.isContiguous());
6705 assert(filters.isContiguous());
6706
6707 assert(!hidActs.isTrans());
6708 assert(!filters.isTrans());
6709 assert(!targets.isTrans());
6710 // These routines don't handle the case when only part of the image is visited in the convolution
6711 assert(paddingStart <= 0);
6712 // assert changed into if statement by Ian Goodfellow
6713 if (paddingStart + (numModulesX-1)*moduleStride + filterSize < imgSizeX)
6714 {
6715 printf("imgSizeX: %d\n", imgSizeX);
6716 printf("Bound on image size: %d\n", paddingStart + (numModulesX-1)*moduleStride+filterSize);
6717 printf("paddingStart: %d\n", paddingStart);
6718 printf("numModulesX: %d\n", numModulesX);
6719 printf("moduleStride: %d\n", moduleStride);
6720 printf("filterSize: %d\n", filterSize);
6721 assert(false);
6722 }
6723 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
6724 assert(moduleStride <= filterSize);
6725
6726 assert(targets.isContiguous()); // no stride support here!
6727
6728 dim3 blocks;
6729 dim3 threads(16,16);
6730 int colorsPerThread;
6731 int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
6732 if (numFilterColors % 8 == 0) {
6733 threads = dim3(32, 4);
6734 colorsPerThread = numFilterColors % 16 == 0 ? 4 : 2;
6735 imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
6736 assert(numFilterColors % (threads.y * colorsPerThread) == 0);
6737
6738 blocks = dim3(DIVUP(numImages, threads.x*imgsPerThread) * (numImgColors/(threads.y*colorsPerThread)), imgPixels);
6739 } else if (numFilterColors > 3) {
6740 colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2;
6741 blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread) * (numImgColors / colorsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
6742 } else {
6743 blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
6744 }
6745 bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
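    /* Note (added): the branching below only selects a template instantiation;
     * imgsPerThread, colorsPerThread, scale, checkCaseBounds and conv must be
     * compile-time constants so the kernels' #pragma unroll loops fully unroll. */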
6746
6747 if (scaleTargets == 0) { // do not scale or use targets matrix
6748 targets.resize(numImgColors*imgPixels, numImages);
6749 } else {
6750 assert(targets.getNumRows() == numImgColors * imgPixels);
6751 assert(targets.getNumCols() == numImages);
6752 }
6753 if (conv) { // convolutional units
6754 if (scaleTargets == 0) { // do not scale or use targets matrix
6755 if (numFilterColors % 8 == 0) {
6756 if (imgsPerThread == 4) {
6757 if (checkCaseBounds) {
6758 if (numFilterColors % 16 == 0) {
6759 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, true, true>, cudaFuncCachePreferShared);
6760 conv_img_acts_manycolor<4, 32, 4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6761 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6762 } else {
6763 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, true, true>, cudaFuncCachePreferShared);
6764 conv_img_acts_manycolor<4, 32, 4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6765 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6766 }
6767 } else {
6768 if (numFilterColors % 16 == 0) {
6769 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, false, true>, cudaFuncCachePreferShared);
6770 conv_img_acts_manycolor<4, 32, 4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6771 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6772 } else {
6773 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, false, true>, cudaFuncCachePreferShared);
6774 conv_img_acts_manycolor<4, 32, 4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6775 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6776 }
6777 }
6778 } else if (imgsPerThread == 2) {
6779 if (checkCaseBounds) {
6780 if (numFilterColors % 16 == 0) {
6781 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, true, true>, cudaFuncCachePreferShared);
6782 conv_img_acts_manycolor<4, 32, 2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6783 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6784 } else {
6785 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, true, true>, cudaFuncCachePreferShared);
6786 conv_img_acts_manycolor<4, 32, 2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6787 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6788 }
6789 } else {
6790 if (numFilterColors % 16 == 0) {
6791 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, false, true>, cudaFuncCachePreferShared);
6792 conv_img_acts_manycolor<4, 32, 2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6793 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6794 } else {
6795 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, false, true>, cudaFuncCachePreferShared);
6796 conv_img_acts_manycolor<4, 32, 2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6797 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6798 }
6799 }
6800 } else {
6801 if (checkCaseBounds) {
6802 if (numFilterColors % 16 == 0) {
6803 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, true, true>, cudaFuncCachePreferShared);
6804 conv_img_acts_manycolor<4, 32, 1, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6805 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6806 } else {
6807 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, true, true>, cudaFuncCachePreferShared);
6808 conv_img_acts_manycolor<4, 32, 1, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6809 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6810 }
6811 } else {
6812 if (numFilterColors % 16 == 0) {
6813 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, false, true>, cudaFuncCachePreferShared);
6814 conv_img_acts_manycolor<4, 32, 1, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6815 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6816 } else {
6817 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, false, true>, cudaFuncCachePreferShared);
6818 conv_img_acts_manycolor<4, 32, 1, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6819 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6820 }
6821 }
6822 }
6823 } else if (numFilterColors > 3) {
6824 if (imgsPerThread == 8) {
6825 if (checkCaseBounds) {
6826 if (colorsPerThread == 4) {
6827 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, true, true>, cudaFuncCachePreferShared);
6828 img_acts_mediumcolor<8, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6829 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6830 } else {
6831 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, true, true>, cudaFuncCachePreferShared);
6832 img_acts_mediumcolor<8, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6833 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6834 }
6835 } else {
6836 if (colorsPerThread == 4) {
6837 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, false, true>, cudaFuncCachePreferShared);
6838 img_acts_mediumcolor<8, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6839 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6840 } else {
6841 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, false, true>, cudaFuncCachePreferShared);
6842 img_acts_mediumcolor<8, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6843 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6844 }
6845 }
6846 } else if (imgsPerThread == 4) {
6847 if (checkCaseBounds) {
6848 if (colorsPerThread == 4) {
6849 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, true, true>, cudaFuncCachePreferShared);
6850 img_acts_mediumcolor<4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6851 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6852 } else {
6853 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, true, true>, cudaFuncCachePreferShared);
6854 img_acts_mediumcolor<4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6855 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6856 }
6857 } else {
6858 if (colorsPerThread == 4) {
6859 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, false, true>, cudaFuncCachePreferShared);
6860 img_acts_mediumcolor<4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6861 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6862 } else {
6863 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, false, true>, cudaFuncCachePreferShared);
6864 img_acts_mediumcolor<4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6865 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6866 }
6867 }
6868 } else {
6869 if (checkCaseBounds) {
6870 if (colorsPerThread == 4) {
6871 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, true, true>, cudaFuncCachePreferShared);
6872 img_acts_mediumcolor<2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6873 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6874 } else {
6875 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, true, true>, cudaFuncCachePreferShared);
6876 img_acts_mediumcolor<2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6877 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6878 }
6879 } else {
6880 if (colorsPerThread == 4) {
6881 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, false, true>, cudaFuncCachePreferShared);
6882 img_acts_mediumcolor<2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6883 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6884 } else {
6885 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, false, true>, cudaFuncCachePreferShared);
6886 img_acts_mediumcolor<2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6887 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6888 }
6889 }
6890 }
6891 } else {
6892 if (imgsPerThread == 8) {
6893 if (checkCaseBounds) {
6894 if (numFilterColors == 1) {
6895 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, true, true>, cudaFuncCachePreferShared);
6896 img_acts_color<8, 1, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6897 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6898 } else if (numFilterColors == 2) {
6899 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, true, true>, cudaFuncCachePreferShared);
6900 img_acts_color<8, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6901 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6902 } else if (numFilterColors == 3) {
6903 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, true, true>, cudaFuncCachePreferShared);
6904 img_acts_color<8, 3, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6905 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6906 }
6907 } else {
6908 if (numFilterColors == 1) {
6909 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, false, true>, cudaFuncCachePreferShared);
6910 img_acts_color<8, 1, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6911 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6912 } else if (numFilterColors == 2) {
6913 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, false, true>, cudaFuncCachePreferShared);
6914 img_acts_color<8, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6915 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6916 } else if (numFilterColors == 3) {
6917 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, false, true>, cudaFuncCachePreferShared);
6918 img_acts_color<8, 3, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6919 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6920 }
6921 }
6922 } else if (imgsPerThread == 4) {
6923 if (checkCaseBounds) {
6924 if (numFilterColors == 1) {
6925 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, true, true>, cudaFuncCachePreferShared);
6926 img_acts_color<4, 1, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6927 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6928 } else if (numFilterColors == 2) {
6929 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, true, true>, cudaFuncCachePreferShared);
6930 img_acts_color<4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6931 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6932 } else if (numFilterColors == 3) {
6933 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, true, true>, cudaFuncCachePreferShared);
6934 img_acts_color<4, 3, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6935 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6936 }
6937 } else {
6938 if (numFilterColors == 1) {
6939 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, false, true>, cudaFuncCachePreferShared);
6940 img_acts_color<4, 1, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6941 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6942 } else if (numFilterColors == 2) {
6943 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, false, true>, cudaFuncCachePreferShared);
6944 img_acts_color<4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6945 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6946 } else if (numFilterColors == 3) {
6947 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, false, true>, cudaFuncCachePreferShared);
6948 img_acts_color<4, 3, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6949 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6950 }
6951 }
6952 } else {
6953 if (checkCaseBounds) {
6954 if (numFilterColors == 1) {
6955 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, true, true>, cudaFuncCachePreferShared);
6956 img_acts_color<2, 1, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6957 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6958 } else if (numFilterColors == 2) {
6959 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, true, true>, cudaFuncCachePreferShared);
6960 img_acts_color<2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6961 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6962 } else if (numFilterColors == 3) {
6963 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, true, true>, cudaFuncCachePreferShared);
6964 img_acts_color<2, 3, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6965 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6966 }
6967 } else {
6968 if (numFilterColors == 1) {
6969 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, false, true>, cudaFuncCachePreferShared);
6970 img_acts_color<2, 1, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6971 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6972 } else if (numFilterColors == 2) {
6973 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, false, true>, cudaFuncCachePreferShared);
6974 img_acts_color<2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6975 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6976 } else if (numFilterColors == 3) {
6977 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, false, true>, cudaFuncCachePreferShared);
6978 img_acts_color<2, 3, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6979 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6980 }
6981 }
6982 }
6983 }
6984 } else { // do scale
6985 if (numFilterColors % 8 == 0) {
6986 if (imgsPerThread == 4) {
6987 if (checkCaseBounds) {
6988 if (numFilterColors % 16 == 0) {
6989 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, true, true>, cudaFuncCachePreferShared);
6990 conv_img_acts_manycolor<4, 32, 4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6991 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6992 } else {
6993 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, true, true>, cudaFuncCachePreferShared);
6994 conv_img_acts_manycolor<4, 32, 4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6995 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6996 }
6997 } else {
6998 if (numFilterColors % 16 == 0) {
6999 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, false, true>, cudaFuncCachePreferShared);
7000 conv_img_acts_manycolor<4, 32, 4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7001 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7002 } else {
7003 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, false, true>, cudaFuncCachePreferShared);
7004 conv_img_acts_manycolor<4, 32, 4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7005 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7006 }
7007 }
7008 } else if (imgsPerThread == 2) {
7009 if (checkCaseBounds) {
7010 if (numFilterColors % 16 == 0) {
7011 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, true, true>, cudaFuncCachePreferShared);
7012 conv_img_acts_manycolor<4, 32, 2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7013 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7014 } else {
7015 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, true, true>, cudaFuncCachePreferShared);
7016 conv_img_acts_manycolor<4, 32, 2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7017 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7018 }
7019 } else {
7020 if (numFilterColors % 16 == 0) {
7021 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, false, true>, cudaFuncCachePreferShared);
7022 conv_img_acts_manycolor<4, 32, 2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7023 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7024 } else {
7025 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, false, true>, cudaFuncCachePreferShared);
7026 conv_img_acts_manycolor<4, 32, 2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7027 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7028 }
7029 }
7030 } else {
7031 if (checkCaseBounds) {
7032 if (numFilterColors % 16 == 0) {
7033 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, true, true>, cudaFuncCachePreferShared);
7034 conv_img_acts_manycolor<4, 32, 1, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7035 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7036 } else {
7037 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, true, true>, cudaFuncCachePreferShared);
7038 conv_img_acts_manycolor<4, 32, 1, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7039 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7040 }
7041 } else {
7042 if (numFilterColors % 16 == 0) {
7043 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, false, true>, cudaFuncCachePreferShared);
7044 conv_img_acts_manycolor<4, 32, 1, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7045 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7046 } else {
7047 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, false, true>, cudaFuncCachePreferShared);
7048 conv_img_acts_manycolor<4, 32, 1, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7049 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7050 }
7051 }
7052 }
7053 } else if (numFilterColors > 3) {
7054 if (imgsPerThread == 8) {
7055 if (checkCaseBounds) {
7056 if (colorsPerThread == 4) {
7057 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, true, true>, cudaFuncCachePreferShared);
7058 img_acts_mediumcolor<8, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7059 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7060 } else {
7061 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, true, true>, cudaFuncCachePreferShared);
7062 img_acts_mediumcolor<8, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7063 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7064 }
7065 } else {
7066 if (colorsPerThread == 4) {
7067 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, false, true>, cudaFuncCachePreferShared);
7068 img_acts_mediumcolor<8, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7069 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7070 } else {
7071 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, false, true>, cudaFuncCachePreferShared);
7072 img_acts_mediumcolor<8, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7073 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7074 }
7075 }
7076 } else if (imgsPerThread == 4) {
7077 if (checkCaseBounds) {
7078 if (colorsPerThread == 4) {
7079 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, true, true>, cudaFuncCachePreferShared);
7080 img_acts_mediumcolor<4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7081 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7082 } else {
7083 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, true, true>, cudaFuncCachePreferShared);
7084 img_acts_mediumcolor<4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7085 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7086 }
7087 } else {
7088 if (colorsPerThread == 4) {
7089 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, false, true>, cudaFuncCachePreferShared);
7090 img_acts_mediumcolor<4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7091 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7092 } else {
7093 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, false, true>, cudaFuncCachePreferShared);
7094 img_acts_mediumcolor<4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7095 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7096 }
7097 }
7098 } else {
7099 if (checkCaseBounds) {
7100 if (colorsPerThread == 4) {
7101 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, true, true>, cudaFuncCachePreferShared);
7102 img_acts_mediumcolor<2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7103 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7104 } else {
7105 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, true, true>, cudaFuncCachePreferShared);
7106 img_acts_mediumcolor<2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7107 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7108 }
7109 } else {
7110 if (colorsPerThread == 4) {
7111 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, false, true>, cudaFuncCachePreferShared);
7112 img_acts_mediumcolor<2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7113 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7114 } else {
7115 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, false, true>, cudaFuncCachePreferShared);
7116 img_acts_mediumcolor<2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7117 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7118 }
7119 }
7120 }
7121 } else {
7122 if (imgsPerThread == 8) {
7123 if (checkCaseBounds) {
7124 if (numFilterColors == 1) {
7125 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, true, true>, cudaFuncCachePreferShared);
7126 img_acts_color<8, 1, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7127 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7128 } else if (numFilterColors == 2) {
7129 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, true, true>, cudaFuncCachePreferShared);
7130 img_acts_color<8, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7131 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7132 } else if (numFilterColors == 3) {
7133 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, true, true>, cudaFuncCachePreferShared);
7134 img_acts_color<8, 3, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7135 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7136 }
7137 } else {
7138 if (numFilterColors == 1) {
7139 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, false, true>, cudaFuncCachePreferShared);
7140 img_acts_color<8, 1, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7141 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7142 } else if (numFilterColors == 2) {
7143 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, false, true>, cudaFuncCachePreferShared);
7144 img_acts_color<8, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7145 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7146 } else if (numFilterColors == 3) {
7147 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, false, true>, cudaFuncCachePreferShared);
7148 img_acts_color<8, 3, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7149 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7150 }
7151 }
7152 } else if (imgsPerThread == 4) {
7153 if (checkCaseBounds) {
7154 if (numFilterColors == 1) {
7155 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, true, true>, cudaFuncCachePreferShared);
7156 img_acts_color<4, 1, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7157 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7158 } else if (numFilterColors == 2) {
7159 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, true, true>, cudaFuncCachePreferShared);
7160 img_acts_color<4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7161 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7162 } else if (numFilterColors == 3) {
7163 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, true, true>, cudaFuncCachePreferShared);
7164 img_acts_color<4, 3, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7165 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7166 }
7167 } else {
7168 if (numFilterColors == 1) {
7169 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, false, true>, cudaFuncCachePreferShared);
7170 img_acts_color<4, 1, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7171 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7172 } else if (numFilterColors == 2) {
7173 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, false, true>, cudaFuncCachePreferShared);
7174 img_acts_color<4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7175 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7176 } else if (numFilterColors == 3) {
7177 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, false, true>, cudaFuncCachePreferShared);
7178 img_acts_color<4, 3, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7179 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7180 }
7181 }
7182 } else {
7183 if (checkCaseBounds) {
7184 if (numFilterColors == 1) {
7185 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, true, true>, cudaFuncCachePreferShared);
7186 img_acts_color<2, 1, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7187 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7188 } else if (numFilterColors == 2) {
7189 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, true, true>, cudaFuncCachePreferShared);
7190 img_acts_color<2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7191 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7192 } else if (numFilterColors == 3) {
7193 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, true, true>, cudaFuncCachePreferShared);
7194 img_acts_color<2, 3, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7195 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7196 }
7197 } else {
7198 if (numFilterColors == 1) {
7199 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, false, true>, cudaFuncCachePreferShared);
7200 img_acts_color<2, 1, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7201 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7202 } else if (numFilterColors == 2) {
7203 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, false, true>, cudaFuncCachePreferShared);
7204 img_acts_color<2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7205 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7206 } else if (numFilterColors == 3) {
7207 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, false, true>, cudaFuncCachePreferShared);
7208 img_acts_color<2, 3, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7209 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7210 }
7211 }
7212 }
7213 }
7214 }
7215 } else { // local, unshared units
7216 if (scaleTargets == 0) { // do not scale or use targets matrix
7217 if (numFilterColors % 8 == 0) {
7218 if (imgsPerThread == 4) {
7219 if (checkCaseBounds) {
7220 if (numFilterColors % 16 == 0) {
7221 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, true, false>, cudaFuncCachePreferShared);
7222 conv_img_acts_manycolor<4, 32, 4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7223 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7224 } else {
7225 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, true, false>, cudaFuncCachePreferShared);
7226 conv_img_acts_manycolor<4, 32, 4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7227 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7228 }
7229 } else {
7230 if (numFilterColors % 16 == 0) {
7231 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, false, false>, cudaFuncCachePreferShared);
7232 conv_img_acts_manycolor<4, 32, 4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7233 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7234 } else {
7235 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, false, false>, cudaFuncCachePreferShared);
7236 conv_img_acts_manycolor<4, 32, 4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7237 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7238 }
7239 }
7240 } else if (imgsPerThread == 2) {
7241 if (checkCaseBounds) {
7242 if (numFilterColors % 16 == 0) {
7243 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, true, false>, cudaFuncCachePreferShared);
7244 conv_img_acts_manycolor<4, 32, 2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7245 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7246 } else {
7247 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, true, false>, cudaFuncCachePreferShared);
7248 conv_img_acts_manycolor<4, 32, 2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7249 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7250 }
7251 } else {
7252 if (numFilterColors % 16 == 0) {
7253 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, false, false>, cudaFuncCachePreferShared);
7254 conv_img_acts_manycolor<4, 32, 2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7255 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7256 } else {
7257 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, false, false>, cudaFuncCachePreferShared);
7258 conv_img_acts_manycolor<4, 32, 2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7259 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7260 }
7261 }
7262 } else {
7263 if (checkCaseBounds) {
7264 if (numFilterColors % 16 == 0) {
7265 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, true, false>, cudaFuncCachePreferShared);
7266 conv_img_acts_manycolor<4, 32, 1, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7267 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7268 } else {
7269 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, true, false>, cudaFuncCachePreferShared);
7270 conv_img_acts_manycolor<4, 32, 1, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7271 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7272 }
7273 } else {
7274 if (numFilterColors % 16 == 0) {
7275 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, false, false>, cudaFuncCachePreferShared);
7276 conv_img_acts_manycolor<4, 32, 1, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7277 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7278 } else {
7279 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, false, false>, cudaFuncCachePreferShared);
7280 conv_img_acts_manycolor<4, 32, 1, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7281 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7282 }
7283 }
7284 }
7285 } else if (numFilterColors > 3) {
7286 if (imgsPerThread == 8) {
7287 if (checkCaseBounds) {
7288 if (colorsPerThread == 4) {
7289 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, true, false>, cudaFuncCachePreferShared);
7290 img_acts_mediumcolor<8, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7291 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7292 } else {
7293 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, true, false>, cudaFuncCachePreferShared);
7294 img_acts_mediumcolor<8, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7295 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7296 }
7297 } else {
7298 if (colorsPerThread == 4) {
7299 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, false, false>, cudaFuncCachePreferShared);
7300 img_acts_mediumcolor<8, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7301 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7302 } else {
7303 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, false, false>, cudaFuncCachePreferShared);
7304 img_acts_mediumcolor<8, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7305 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7306 }
7307 }
7308 } else if (imgsPerThread == 4) {
7309 if (checkCaseBounds) {
7310 if (colorsPerThread == 4) {
7311 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, true, false>, cudaFuncCachePreferShared);
7312 img_acts_mediumcolor<4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7313 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7314 } else {
7315 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, true, false>, cudaFuncCachePreferShared);
7316 img_acts_mediumcolor<4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7317 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7318 }
7319 } else {
7320 if (colorsPerThread == 4) {
7321 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, false, false>, cudaFuncCachePreferShared);
7322 img_acts_mediumcolor<4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7323 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7324 } else {
7325 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, false, false>, cudaFuncCachePreferShared);
7326 img_acts_mediumcolor<4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7327 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7328 }
7329 }
7330 } else {
7331 if (checkCaseBounds) {
7332 if (colorsPerThread == 4) {
7333 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, true, false>, cudaFuncCachePreferShared);
7334 img_acts_mediumcolor<2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7335 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7336 } else {
7337 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, true, false>, cudaFuncCachePreferShared);
7338 img_acts_mediumcolor<2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7339 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7340 }
7341 } else {
7342 if (colorsPerThread == 4) {
7343 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, false, false>, cudaFuncCachePreferShared);
7344 img_acts_mediumcolor<2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7345 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7346 } else {
7347 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, false, false>, cudaFuncCachePreferShared);
7348 img_acts_mediumcolor<2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7349 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7350 }
7351 }
7352 }
7353 } else {
7354 if (imgsPerThread == 8) {
7355 if (checkCaseBounds) {
7356 if (numFilterColors == 1) {
7357 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, true, false>, cudaFuncCachePreferShared);
7358 img_acts_color<8, 1, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7359 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7360 } else if (numFilterColors == 2) {
7361 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, true, false>, cudaFuncCachePreferShared);
7362 img_acts_color<8, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7363 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7364 } else if (numFilterColors == 3) {
7365 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, true, false>, cudaFuncCachePreferShared);
7366 img_acts_color<8, 3, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7367 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7368 }
7369 } else {
7370 if (numFilterColors == 1) {
7371 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, false, false>, cudaFuncCachePreferShared);
7372 img_acts_color<8, 1, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7373 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7374 } else if (numFilterColors == 2) {
7375 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, false, false>, cudaFuncCachePreferShared);
7376 img_acts_color<8, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7377 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7378 } else if (numFilterColors == 3) {
7379 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, false, false>, cudaFuncCachePreferShared);
7380 img_acts_color<8, 3, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7381 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7382 }
7383 }
7384 } else if (imgsPerThread == 4) {
7385 if (checkCaseBounds) {
7386 if (numFilterColors == 1) {
7387 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, true, false>, cudaFuncCachePreferShared);
7388 img_acts_color<4, 1, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7389 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7390 } else if (numFilterColors == 2) {
7391 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, true, false>, cudaFuncCachePreferShared);
7392 img_acts_color<4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7393 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7394 } else if (numFilterColors == 3) {
7395 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, true, false>, cudaFuncCachePreferShared);
7396 img_acts_color<4, 3, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7397 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7398 }
7399 } else {
7400 if (numFilterColors == 1) {
7401 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, false, false>, cudaFuncCachePreferShared);
7402 img_acts_color<4, 1, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7403 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7404 } else if (numFilterColors == 2) {
7405 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, false, false>, cudaFuncCachePreferShared);
7406 img_acts_color<4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7407 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7408 } else if (numFilterColors == 3) {
7409 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, false, false>, cudaFuncCachePreferShared);
7410 img_acts_color<4, 3, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7411 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7412 }
7413 }
7414 } else {
7415 if (checkCaseBounds) {
7416 if (numFilterColors == 1) {
7417 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, true, false>, cudaFuncCachePreferShared);
7418 img_acts_color<2, 1, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7419 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7420 } else if (numFilterColors == 2) {
7421 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, true, false>, cudaFuncCachePreferShared);
7422 img_acts_color<2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7423 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7424 } else if (numFilterColors == 3) {
7425 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, true, false>, cudaFuncCachePreferShared);
7426 img_acts_color<2, 3, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7427 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7428 }
7429 } else {
7430 if (numFilterColors == 1) {
7431 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, false, false>, cudaFuncCachePreferShared);
7432 img_acts_color<2, 1, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7433 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7434 } else if (numFilterColors == 2) {
7435 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, false, false>, cudaFuncCachePreferShared);
7436 img_acts_color<2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7437 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7438 } else if (numFilterColors == 3) {
7439 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, false, false>, cudaFuncCachePreferShared);
7440 img_acts_color<2, 3, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7441 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7442 }
7443 }
7444 }
7445 }
7446 } else { // do scale
7447 if (numFilterColors % 8 == 0) {
7448 if (imgsPerThread == 4) {
7449 if (checkCaseBounds) {
7450 if (numFilterColors % 16 == 0) {
7451 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, true, false>, cudaFuncCachePreferShared);
7452 conv_img_acts_manycolor<4, 32, 4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7453 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7454 } else {
7455 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, true, false>, cudaFuncCachePreferShared);
7456 conv_img_acts_manycolor<4, 32, 4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7457 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7458 }
7459 } else {
7460 if (numFilterColors % 16 == 0) {
7461 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, false, false>, cudaFuncCachePreferShared);
7462 conv_img_acts_manycolor<4, 32, 4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7463 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7464 } else {
7465 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, false, false>, cudaFuncCachePreferShared);
7466 conv_img_acts_manycolor<4, 32, 4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7467 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7468 }
7469 }
7470 } else if (imgsPerThread == 2) {
7471 if (checkCaseBounds) {
7472 if (numFilterColors % 16 == 0) {
7473 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, true, false>, cudaFuncCachePreferShared);
7474 conv_img_acts_manycolor<4, 32, 2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7475 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7476 } else {
7477 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, true, false>, cudaFuncCachePreferShared);
7478 conv_img_acts_manycolor<4, 32, 2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7479 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7480 }
7481 } else {
7482 if (numFilterColors % 16 == 0) {
7483 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, false, false>, cudaFuncCachePreferShared);
7484 conv_img_acts_manycolor<4, 32, 2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7485 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7486 } else {
7487 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, false, false>, cudaFuncCachePreferShared);
7488 conv_img_acts_manycolor<4, 32, 2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7489 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7490 }
7491 }
7492 } else {
7493 if (checkCaseBounds) {
7494 if (numFilterColors % 16 == 0) {
7495 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, true, false>, cudaFuncCachePreferShared);
7496 conv_img_acts_manycolor<4, 32, 1, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7497 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7498 } else {
7499 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, true, false>, cudaFuncCachePreferShared);
7500 conv_img_acts_manycolor<4, 32, 1, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7501 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7502 }
7503 } else {
7504 if (numFilterColors % 16 == 0) {
7505 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, false, false>, cudaFuncCachePreferShared);
7506 conv_img_acts_manycolor<4, 32, 1, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7507 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7508 } else {
7509 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, false, false>, cudaFuncCachePreferShared);
7510 conv_img_acts_manycolor<4, 32, 1, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7511 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7512 }
7513 }
7514 }
7515 } else if (numFilterColors > 3) {
7516 if (imgsPerThread == 8) {
7517 if (checkCaseBounds) {
7518 if (colorsPerThread == 4) {
7519 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, true, false>, cudaFuncCachePreferShared);
7520 img_acts_mediumcolor<8, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7521 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7522 } else {
7523 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, true, false>, cudaFuncCachePreferShared);
7524 img_acts_mediumcolor<8, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7525 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7526 }
7527 } else {
7528 if (colorsPerThread == 4) {
7529 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, false, false>, cudaFuncCachePreferShared);
7530 img_acts_mediumcolor<8, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7531 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7532 } else {
7533 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, false, false>, cudaFuncCachePreferShared);
7534 img_acts_mediumcolor<8, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7535 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7536 }
7537 }
7538 } else if (imgsPerThread == 4) {
7539 if (checkCaseBounds) {
7540 if (colorsPerThread == 4) {
7541 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, true, false>, cudaFuncCachePreferShared);
7542 img_acts_mediumcolor<4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7543 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7544 } else {
7545 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, true, false>, cudaFuncCachePreferShared);
7546 img_acts_mediumcolor<4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7547 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7548 }
7549 } else {
7550 if (colorsPerThread == 4) {
7551 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, false, false>, cudaFuncCachePreferShared);
7552 img_acts_mediumcolor<4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7553 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7554 } else {
7555 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, false, false>, cudaFuncCachePreferShared);
7556 img_acts_mediumcolor<4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7557 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7558 }
7559 }
7560 } else {
7561 if (checkCaseBounds) {
7562 if (colorsPerThread == 4) {
7563 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, true, false>, cudaFuncCachePreferShared);
7564 img_acts_mediumcolor<2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7565 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7566 } else {
7567 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, true, false>, cudaFuncCachePreferShared);
7568 img_acts_mediumcolor<2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7569 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7570 }
7571 } else {
7572 if (colorsPerThread == 4) {
7573 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, false, false>, cudaFuncCachePreferShared);
7574 img_acts_mediumcolor<2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7575 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7576 } else {
7577 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, false, false>, cudaFuncCachePreferShared);
7578 img_acts_mediumcolor<2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7579 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7580 }
7581 }
7582 }
7583 } else {
7584 if (imgsPerThread == 8) {
7585 if (checkCaseBounds) {
7586 if (numFilterColors == 1) {
7587 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, true, false>, cudaFuncCachePreferShared);
7588 img_acts_color<8, 1, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7589 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7590 } else if (numFilterColors == 2) {
7591 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, true, false>, cudaFuncCachePreferShared);
7592 img_acts_color<8, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7593 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7594 } else if (numFilterColors == 3) {
7595 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, true, false>, cudaFuncCachePreferShared);
7596 img_acts_color<8, 3, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7597 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7598 }
7599 } else {
7600 if (numFilterColors == 1) {
7601 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, false, false>, cudaFuncCachePreferShared);
7602 img_acts_color<8, 1, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7603 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7604 } else if (numFilterColors == 2) {
7605 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, false, false>, cudaFuncCachePreferShared);
7606 img_acts_color<8, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7607 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7608 } else if (numFilterColors == 3) {
7609 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, false, false>, cudaFuncCachePreferShared);
7610 img_acts_color<8, 3, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7611 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7612 }
7613 }
7614 } else if (imgsPerThread == 4) {
7615 if (checkCaseBounds) {
7616 if (numFilterColors == 1) {
7617 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, true, false>, cudaFuncCachePreferShared);
7618 img_acts_color<4, 1, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7619 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7620 } else if (numFilterColors == 2) {
7621 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, true, false>, cudaFuncCachePreferShared);
7622 img_acts_color<4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7623 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7624 } else if (numFilterColors == 3) {
7625 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, true, false>, cudaFuncCachePreferShared);
7626 img_acts_color<4, 3, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7627 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7628 }
7629 } else {
7630 if (numFilterColors == 1) {
7631 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, false, false>, cudaFuncCachePreferShared);
7632 img_acts_color<4, 1, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7633 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7634 } else if (numFilterColors == 2) {
7635 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, false, false>, cudaFuncCachePreferShared);
7636 img_acts_color<4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7637 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7638 } else if (numFilterColors == 3) {
7639 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, false, false>, cudaFuncCachePreferShared);
7640 img_acts_color<4, 3, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7641 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7642 }
7643 }
7644 } else {
7645 if (checkCaseBounds) {
7646 if (numFilterColors == 1) {
7647 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, true, false>, cudaFuncCachePreferShared);
7648 img_acts_color<2, 1, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7649 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7650 } else if (numFilterColors == 2) {
7651 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, true, false>, cudaFuncCachePreferShared);
7652 img_acts_color<2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7653 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7654 } else if (numFilterColors == 3) {
7655 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, true, false>, cudaFuncCachePreferShared);
7656 img_acts_color<2, 3, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7657 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7658 }
7659 } else {
7660 if (numFilterColors == 1) {
7661 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, false, false>, cudaFuncCachePreferShared);
7662 img_acts_color<2, 1, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7663 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7664 } else if (numFilterColors == 2) {
7665 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, false, false>, cudaFuncCachePreferShared);
7666 img_acts_color<2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7667 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7668 } else if (numFilterColors == 3) {
7669 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, false, false>, cudaFuncCachePreferShared);
7670 img_acts_color<2, 3, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7671 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7672 }
7673 }
7674 }
7675 }
7676 }
7677 }
7678
7679 cutilCheckMsg("imgActs: kernel execution failed");
7680 }
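/*
 * Editorial sketch (not part of the original library): the wall of branches in
 * _imgActs above exists to turn the runtime flags (scaleTargets != 0,
 * checkCaseBounds, conv) and the imgsPerThread/colorsPerThread tile sizes into
 * compile-time template arguments, so each instantiated kernel can drop the
 * branches and bounds checks it does not need. A minimal illustration of the
 * same idiom, using hypothetical names:
 */
template <bool scale, bool checkBounds>
__global__ void exampleDispatchKernel(float* tgt, const int n, const float a) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (!checkBounds || i < n) {                    // branch vanishes when checkBounds == false
        tgt[i] = scale ? a * tgt[i] + 1.0f : 1.0f;  // 'scale' is resolved at compile time
    }
}

void exampleDispatch(float* tgt, const int n, const float a, const bool scale) {
    const dim3 threads(128), blocks(DIVUP(n, 128));
    // The fast (unchecked) path is only legal when the grid covers n exactly,
    // mirroring the checkCaseBounds logic used by _imgActs above.
    const bool checkBounds = n % 128 != 0;
    if (scale) {
        if (checkBounds) { exampleDispatchKernel<true, true><<<blocks, threads>>>(tgt, n, a); }
        else             { exampleDispatchKernel<true, false><<<blocks, threads>>>(tgt, n, a); }
    } else {
        if (checkBounds) { exampleDispatchKernel<false, true><<<blocks, threads>>>(tgt, n, a); }
        else             { exampleDispatchKernel<false, false><<<blocks, threads>>>(tgt, n, a); }
    }
}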
7681
7682
7683 void convImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7684 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) {
7685 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, true);
7686 }
7687
7688 void convImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7689 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
7690 float scaleTargets, float scaleOutput) {
7691 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true);
7692 }
7693
7694 void localImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7695 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) {
7696 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, false);
7697 }
7698
7699 void localImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7700 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
7701 float scaleTargets, float scaleOutput) {
7702 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false);
7703 }
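/*
 * Usage sketch (illustrative sizes, not from the original source): for a
 * convolutional layer with 16 image colors, 64 filters of size 5x5 applied to
 * 32x32 inputs with stride 1 and zero padding of 2, the backward-data pass
 * could be invoked as:
 *
 *     // hidActs: (numFilters * numModulesY * numModulesX, numImages)
 *     // filters: (numFilterColors * filterPixels, numFilters)
 *     // targets: resized by _imgActs to (numImgColors * imgPixels, numImages)
 *     convImgActs(hidActs, filters, targets,
 *                 32, 32,  // imgSizeY, imgSizeX
 *                 32,      // numModulesY (32x32 module grid, "same"-style padding)
 *                 -2,      // paddingStart (non-positive; -2 means 2 pixels of padding)
 *                 1,       // moduleStride
 *                 16,      // numImgColors
 *                 1);      // numGroups
 */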
7704
7705
7706 /*
7707 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
7708 * filters: (numFilterColors, filterPixels, numFilters) if conv
7709 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
7710 * targets: (overSample, numImgColors, imgSizeY, imgSizeX, numImages)
7711 * colorIndices: (numGroups, numFilterColors)
7712 *
7713 * where overSample := (numFilterColors * numGroups) / numImgColors
7714 *
7715 * Note: all of these convolution routines are optimized for the case when
7716 * the number of images (i.e. the minibatch size) is a multiple of 128.
7717  * Other batch sizes will work, but I made no attempt whatsoever
7718 * to make them work fast.
7719 */
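/*
 * Worked example of overSample (illustrative numbers): with numImgColors = 32,
 * numGroups = 4 and numFilterColors = 16, each filter group reads a subset of
 * 16 of the 32 image colors, so overSample = (16 * 4) / 32 = 2. The targets
 * matrix is accordingly resized below to overSample * numImgColors * imgPixels
 * rows, i.e. it holds two partial image-gradient copies per color, presumably
 * to be summed over the overSample dimension downstream.
 */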
7720 void _imgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
7721 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride,
7722 int numImgColors, int numFilterColors, int numGroups,
7723 float scaleTargets, float scaleOutput, bool conv) {
7724 int numImages = hidActs.getNumCols();
7725 int numFilters = filters.getNumCols();
7726 // int numFiltersPerGroup = numFilters / numGroups;
7727 int numModules = hidActs.getNumRows() / numFilters;
7728 int filterModuleMult = conv ? 1 : numModules;
7729 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
7730 int filterSize = sqrt((double)filterPixels);
7731 int imgPixels = imgSizeY * imgSizeX;
7732 int numModulesX = numModules / numModulesY;
7733 int overSample = (numFilterColors * numGroups) / numImgColors;
7734
7735 assert(numImgColors % numFilterColors == 0);
7736 assert(numFilters % (16*numGroups) == 0);
7737 assert((numFilterColors * numGroups) % numImgColors == 0);
7738 assert(numGroups > 1);
7739 assert(numFilterColors > 3 && numFilterColors % 2 == 0);
7740
7741 assert(filterPixels == filterSize * filterSize);
7742 assert(hidActs.getNumRows() == numModules * numFilters);
7743 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
7744 assert(numModules == numModulesY * numModulesX);
7745
7746 assert(hidActs.isContiguous());
7747 assert(filters.isContiguous());
7748
7749 assert(!hidActs.isTrans());
7750 assert(!filters.isTrans());
7751 assert(!targets.isTrans());
7752 // These routines don't handle the case when only part of the image is visited in the convolution
7753 assert(paddingStart <= 0);
7754 assert(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
7755 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
7756 assert(moduleStride <= filterSize);
7757
7758 assert(targets.isContiguous()); // no stride support here!
7759
7760 dim3 blocks;
7761 dim3 threads;
7762 int colorsPerThread;
7763 int imgsPerThread;
7764 if (numFilterColors % 8 == 0) {
7765 threads = dim3(32, 4);
7766 colorsPerThread = numFilterColors % 16 == 0 ? 4 : 2;
7767 imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
7768 assert(numFilterColors % (threads.y * colorsPerThread) == 0);
7769 blocks = dim3(DIVUP(numImages, threads.x*imgsPerThread) * (numImgColors/(threads.y*colorsPerThread)), overSample * imgPixels);
7770 } else if (numFilterColors > 3) {
7771 imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
7772 threads = dim3(16, 16);
7773 colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2;
7774 blocks = dim3(DIVUP(numImages,16*imgsPerThread) * (numImgColors / colorsPerThread), overSample * DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
7775 }
7776
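    // Bounds-checked kernel variants are only needed when numImages does not
    // divide evenly into the per-block work; otherwise the fast path is taken.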
7777 bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
7778
7779 if (scaleTargets == 0) { // do not scale or use targets matrix
7780 targets.resize(overSample*numImgColors*imgPixels, numImages);
7781 } else {
7782 assert(targets.getNumRows() == overSample * numImgColors * imgPixels);
7783 assert(targets.getNumCols() == numImages);
7784 }
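    // Annotation: scaleTargets == 0 means targets is overwritten outright
    // (and may be resized); otherwise the kernels compute
    // targets = scaleTargets * targets + scaleOutput * result, which is why
    // the existing shape must already match.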
7785
7786 if (conv) {
7787 if (scaleTargets == 0) { // do not scale or use targets matrix
7788 if (numFilterColors % 8 == 0) {
7789 if (imgsPerThread == 4) {
7790 if (checkCaseBounds) {
7791 if (numFilterColors % 16 == 0) {
7792 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, true>, cudaFuncCachePreferShared);
7793 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7794 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7795 } else {
7796 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, true>, cudaFuncCachePreferShared);
7797 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7798 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7799 }
7800 } else {
7801 if (numFilterColors % 16 == 0) {
7802 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, true>, cudaFuncCachePreferShared);
7803 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7804 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7805 } else {
7806 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, true>, cudaFuncCachePreferShared);
7807 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7808 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7809 }
7810 }
7811 } else if (imgsPerThread == 2) {
7812 if (checkCaseBounds) {
7813 if (numFilterColors % 16 == 0) {
7814 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, true>, cudaFuncCachePreferShared);
7815 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7816 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7817 } else {
7818 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, true>, cudaFuncCachePreferShared);
7819 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7820 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7821 }
7822 } else {
7823 if (numFilterColors % 16 == 0) {
7824 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, true>, cudaFuncCachePreferShared);
7825 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7826 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7827 } else {
7828 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, true>, cudaFuncCachePreferShared);
7829 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7830 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7831 }
7832 }
7833 } else {
7834 if (checkCaseBounds) {
7835 if (numFilterColors % 16 == 0) {
7836 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, true>, cudaFuncCachePreferShared);
7837 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7838 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7839 } else {
7840 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, true>, cudaFuncCachePreferShared);
7841 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7842 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7843 }
7844 } else {
7845 if (numFilterColors % 16 == 0) {
7846 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, true>, cudaFuncCachePreferShared);
7847 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7848 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7849 } else {
7850 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, true>, cudaFuncCachePreferShared);
7851 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7852 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7853 }
7854 }
7855 }
7856
7857 } else if (numFilterColors > 3) {
7858 if (imgsPerThread == 8) {
7859 if (checkCaseBounds) {
7860 if (colorsPerThread == 4) {
7861 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, true, true>, cudaFuncCachePreferShared);
7862 img_acts_mediumcolor_sparse_rand<8, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7863 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7864 } else {
7865 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, true, true>, cudaFuncCachePreferShared);
7866 img_acts_mediumcolor_sparse_rand<8, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7867 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7868 }
7869 } else {
7870 if (colorsPerThread == 4) {
7871 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, false, true>, cudaFuncCachePreferShared);
7872 img_acts_mediumcolor_sparse_rand<8, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7873 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7874 } else {
7875 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, false, true>, cudaFuncCachePreferShared);
7876 img_acts_mediumcolor_sparse_rand<8, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7877 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7878 }
7879 }
7880 } else if (imgsPerThread == 4) {
7881 if (checkCaseBounds) {
7882 if (colorsPerThread == 4) {
7883 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, true, true>, cudaFuncCachePreferShared);
7884 img_acts_mediumcolor_sparse_rand<4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7885 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7886 } else {
7887 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, true, true>, cudaFuncCachePreferShared);
7888 img_acts_mediumcolor_sparse_rand<4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7889 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7890 }
7891 } else {
7892 if (colorsPerThread == 4) {
7893 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, false, true>, cudaFuncCachePreferShared);
7894 img_acts_mediumcolor_sparse_rand<4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7895 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7896 } else {
7897 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, false, true>, cudaFuncCachePreferShared);
7898 img_acts_mediumcolor_sparse_rand<4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7899 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7900 }
7901 }
7902 } else {
7903 if (checkCaseBounds) {
7904 if (colorsPerThread == 4) {
7905 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, true, true>, cudaFuncCachePreferShared);
7906 img_acts_mediumcolor_sparse_rand<2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7907 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7908 } else {
7909 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, true, true>, cudaFuncCachePreferShared);
7910 img_acts_mediumcolor_sparse_rand<2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7911 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7912 }
7913 } else {
7914 if (colorsPerThread == 4) {
7915 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, false, true>, cudaFuncCachePreferShared);
7916 img_acts_mediumcolor_sparse_rand<2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7917 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7918 } else {
7919 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, false, true>, cudaFuncCachePreferShared);
7920 img_acts_mediumcolor_sparse_rand<2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7921 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7922 }
7923 }
7924 }
7925 }
7926 } else { // do scale
7927 if (numFilterColors % 8 == 0) {
7928 if (imgsPerThread == 4) {
7929 if (checkCaseBounds) {
7930 if (numFilterColors % 16 == 0) {
7931 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, true>, cudaFuncCachePreferShared);
7932 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7933 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7934 } else {
7935 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, true>, cudaFuncCachePreferShared);
7936 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7937 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7938 }
7939 } else {
7940 if (numFilterColors % 16 == 0) {
7941 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, true>, cudaFuncCachePreferShared);
7942 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7943 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7944 } else {
7945 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, true>, cudaFuncCachePreferShared);
7946 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7947 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7948 }
7949 }
7950 } else if (imgsPerThread == 2) {
7951 if (checkCaseBounds) {
7952 if (numFilterColors % 16 == 0) {
7953 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, true>, cudaFuncCachePreferShared);
7954 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7955 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7956 } else {
7957 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, true>, cudaFuncCachePreferShared);
7958 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7959 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7960 }
7961 } else {
7962 if (numFilterColors % 16 == 0) {
7963 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, true>, cudaFuncCachePreferShared);
7964 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7965 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7966 } else {
7967 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, true>, cudaFuncCachePreferShared);
7968 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7969 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7970 }
7971 }
7972 } else {
7973 if (checkCaseBounds) {
7974 if (numFilterColors % 16 == 0) {
7975 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, true>, cudaFuncCachePreferShared);
7976 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7977 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7978 } else {
7979 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, true>, cudaFuncCachePreferShared);
7980 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7981 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7982 }
7983 } else {
7984 if (numFilterColors % 16 == 0) {
7985 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, true>, cudaFuncCachePreferShared);
7986 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7987 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7988 } else {
7989 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, true>, cudaFuncCachePreferShared);
7990 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7991 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7992 }
7993 }
7994 }
7995
7996 } else if (numFilterColors > 3) {
7997 if (imgsPerThread == 8) {
7998 if (checkCaseBounds) {
7999 if (colorsPerThread == 4) {
8000 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, true, true>, cudaFuncCachePreferShared);
8001 img_acts_mediumcolor_sparse_rand<8, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8002 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8003 } else {
8004 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, true, true>, cudaFuncCachePreferShared);
8005 img_acts_mediumcolor_sparse_rand<8, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8006 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8007 }
8008 } else {
8009 if (colorsPerThread == 4) {
8010 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, false, true>, cudaFuncCachePreferShared);
8011 img_acts_mediumcolor_sparse_rand<8, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8012 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8013 } else {
8014 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, false, true>, cudaFuncCachePreferShared);
8015 img_acts_mediumcolor_sparse_rand<8, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8016 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8017 }
8018 }
8019 } else if (imgsPerThread == 4) {
8020 if (checkCaseBounds) {
8021 if (colorsPerThread == 4) {
8022 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, true, true>, cudaFuncCachePreferShared);
8023 img_acts_mediumcolor_sparse_rand<4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8024 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8025 } else {
8026 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, true, true>, cudaFuncCachePreferShared);
8027 img_acts_mediumcolor_sparse_rand<4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8028 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8029 }
8030 } else {
8031 if (colorsPerThread == 4) {
8032 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, false, true>, cudaFuncCachePreferShared);
8033 img_acts_mediumcolor_sparse_rand<4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8034 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8035 } else {
8036 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, false, true>, cudaFuncCachePreferShared);
8037 img_acts_mediumcolor_sparse_rand<4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8038 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8039 }
8040 }
8041 } else {
8042 if (checkCaseBounds) {
8043 if (colorsPerThread == 4) {
8044 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, true, true>, cudaFuncCachePreferShared);
8045 img_acts_mediumcolor_sparse_rand<2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8046 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8047 } else {
8048 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, true, true>, cudaFuncCachePreferShared);
8049 img_acts_mediumcolor_sparse_rand<2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8050 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8051 }
8052 } else {
8053 if (colorsPerThread == 4) {
8054 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, false, true>, cudaFuncCachePreferShared);
8055 img_acts_mediumcolor_sparse_rand<2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8056 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8057 } else {
8058 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, false, true>, cudaFuncCachePreferShared);
8059 img_acts_mediumcolor_sparse_rand<2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8060 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8061 }
8062 }
8063 }
8064 }
8065 }
8066 } else {
8067 if (scaleTargets == 0) { // do not scale or use targets matrix
8068 if (numFilterColors % 8 == 0) {
8069 if (imgsPerThread == 4) {
8070 if (checkCaseBounds) {
8071 if (numFilterColors % 16 == 0) {
8072 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, false>, cudaFuncCachePreferShared);
8073 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8074 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8075 } else {
8076 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, false>, cudaFuncCachePreferShared);
8077 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8078 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8079 }
8080 } else {
8081 if (numFilterColors % 16 == 0) {
8082 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, false>, cudaFuncCachePreferShared);
8083 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8084 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8085 } else {
8086 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, false>, cudaFuncCachePreferShared);
8087 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8088 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8089 }
8090 }
8091 } else if (imgsPerThread == 2) {
8092 if (checkCaseBounds) {
8093 if (numFilterColors % 16 == 0) {
8094 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, false>, cudaFuncCachePreferShared);
8095 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8096 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8097 } else {
8098 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, false>, cudaFuncCachePreferShared);
8099 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8100 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8101 }
8102 } else {
8103 if (numFilterColors % 16 == 0) {
8104 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, false>, cudaFuncCachePreferShared);
8105 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8106 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8107 } else {
8108 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, false>, cudaFuncCachePreferShared);
8109 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8110 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8111 }
8112 }
8113 } else {
8114 if (checkCaseBounds) {
8115 if (numFilterColors % 16 == 0) {
8116 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, false>, cudaFuncCachePreferShared);
8117 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8118 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8119 } else {
8120 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, false>, cudaFuncCachePreferShared);
8121 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8122 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8123 }
8124 } else {
8125 if (numFilterColors % 16 == 0) {
8126 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, false>, cudaFuncCachePreferShared);
8127 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8128 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8129 } else {
8130 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, false>, cudaFuncCachePreferShared);
8131 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8132 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8133 }
8134 }
8135 }
8136
8137 } else if (numFilterColors > 3) {
8138 if (imgsPerThread == 8) {
8139 if (checkCaseBounds) {
8140 if (colorsPerThread == 4) {
8141 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, true, false>, cudaFuncCachePreferShared);
8142 img_acts_mediumcolor_sparse_rand<8, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8143 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8144 } else {
8145 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, true, false>, cudaFuncCachePreferShared);
8146 img_acts_mediumcolor_sparse_rand<8, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8147 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8148 }
8149 } else {
8150 if (colorsPerThread == 4) {
8151 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, false, false>, cudaFuncCachePreferShared);
8152 img_acts_mediumcolor_sparse_rand<8, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8153 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8154 } else {
8155 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, false, false>, cudaFuncCachePreferShared);
8156 img_acts_mediumcolor_sparse_rand<8, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8157 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8158 }
8159 }
8160 } else if (imgsPerThread == 4) {
8161 if (checkCaseBounds) {
8162 if (colorsPerThread == 4) {
8163 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, true, false>, cudaFuncCachePreferShared);
8164 img_acts_mediumcolor_sparse_rand<4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8165 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8166 } else {
8167 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, true, false>, cudaFuncCachePreferShared);
8168 img_acts_mediumcolor_sparse_rand<4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8169 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8170 }
8171 } else {
8172 if (colorsPerThread == 4) {
8173 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, false, false>, cudaFuncCachePreferShared);
8174 img_acts_mediumcolor_sparse_rand<4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8175 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8176 } else {
8177 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, false, false>, cudaFuncCachePreferShared);
8178 img_acts_mediumcolor_sparse_rand<4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8179 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8180 }
8181 }
8182 } else {
8183 if (checkCaseBounds) {
8184 if (colorsPerThread == 4) {
8185 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, true, false>, cudaFuncCachePreferShared);
8186 img_acts_mediumcolor_sparse_rand<2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8187 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8188 } else {
8189 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, true, false>, cudaFuncCachePreferShared);
8190 img_acts_mediumcolor_sparse_rand<2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8191 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8192 }
8193 } else {
8194 if (colorsPerThread == 4) {
8195 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, false, false>, cudaFuncCachePreferShared);
8196 img_acts_mediumcolor_sparse_rand<2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8197 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8198 } else {
8199 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, false, false>, cudaFuncCachePreferShared);
8200 img_acts_mediumcolor_sparse_rand<2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8201 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8202 }
8203 }
8204 }
8205 }
8206 } else { // do scale
8207 if (numFilterColors % 8 == 0) {
8208 if (imgsPerThread == 4) {
8209 if (checkCaseBounds) {
8210 if (numFilterColors % 16 == 0) {
8211 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, false>, cudaFuncCachePreferShared);
8212 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8213 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8214 } else {
8215 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, false>, cudaFuncCachePreferShared);
8216 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8217 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8218 }
8219 } else {
8220 if (numFilterColors % 16 == 0) {
8221 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, false>, cudaFuncCachePreferShared);
8222 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8223 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8224 } else {
8225 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, false>, cudaFuncCachePreferShared);
8226 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8227 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8228 }
8229 }
8230 } else if (imgsPerThread == 2) {
8231 if (checkCaseBounds) {
8232 if (numFilterColors % 16 == 0) {
8233 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, false>, cudaFuncCachePreferShared);
8234 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8235 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8236 } else {
8237 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, false>, cudaFuncCachePreferShared);
8238 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8239 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8240 }
8241 } else {
8242 if (numFilterColors % 16 == 0) {
8243 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, false>, cudaFuncCachePreferShared);
8244 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8245 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8246 } else {
8247 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, false>, cudaFuncCachePreferShared);
8248 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8249 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8250 }
8251 }
8252 } else {
8253 if (checkCaseBounds) {
8254 if (numFilterColors % 16 == 0) {
8255 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, false>, cudaFuncCachePreferShared);
8256 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8257 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8258 } else {
8259 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, false>, cudaFuncCachePreferShared);
8260 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8261 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8262 }
8263 } else {
8264 if (numFilterColors % 16 == 0) {
8265 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, false>, cudaFuncCachePreferShared);
8266 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8267 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8268 } else {
8269 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, false>, cudaFuncCachePreferShared);
8270 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8271 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8272 }
8273 }
8274 }
8275 } else if (numFilterColors > 3) {
8276 if (imgsPerThread == 8) {
8277 if (checkCaseBounds) {
8278 if (colorsPerThread == 4) {
8279 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, true, false>, cudaFuncCachePreferShared);
8280 img_acts_mediumcolor_sparse_rand<8, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8281 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8282 } else {
8283 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, true, false>, cudaFuncCachePreferShared);
8284 img_acts_mediumcolor_sparse_rand<8, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8285 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8286 }
8287 } else {
8288 if (colorsPerThread == 4) {
8289 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, false, false>, cudaFuncCachePreferShared);
8290 img_acts_mediumcolor_sparse_rand<8, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8291 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8292 } else {
8293 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, false, false>, cudaFuncCachePreferShared);
8294 img_acts_mediumcolor_sparse_rand<8, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8295 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8296 }
8297 }
8298 } else if (imgsPerThread == 4) {
8299 if (checkCaseBounds) {
8300 if (colorsPerThread == 4) {
8301 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, true, false>, cudaFuncCachePreferShared);
8302 img_acts_mediumcolor_sparse_rand<4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8303 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8304 } else {
8305 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, true, false>, cudaFuncCachePreferShared);
8306 img_acts_mediumcolor_sparse_rand<4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8307 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8308 }
8309 } else {
8310 if (colorsPerThread == 4) {
8311 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, false, false>, cudaFuncCachePreferShared);
8312 img_acts_mediumcolor_sparse_rand<4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8313 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8314 } else {
8315 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, false, false>, cudaFuncCachePreferShared);
8316 img_acts_mediumcolor_sparse_rand<4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8317 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8318 }
8319 }
8320 } else {
8321 if (checkCaseBounds) {
8322 if (colorsPerThread == 4) {
8323 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, true, false>, cudaFuncCachePreferShared);
8324 img_acts_mediumcolor_sparse_rand<2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8325 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8326 } else {
8327 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, true, false>, cudaFuncCachePreferShared);
8328 img_acts_mediumcolor_sparse_rand<2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8329 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8330 }
8331 } else {
8332 if (colorsPerThread == 4) {
8333 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, false, false>, cudaFuncCachePreferShared);
8334 img_acts_mediumcolor_sparse_rand<2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8335 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8336 } else {
8337 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, false, false>, cudaFuncCachePreferShared);
8338 img_acts_mediumcolor_sparse_rand<2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8339 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8340 }
8341 }
8342 }
8343 }
8344 }
8345 }
8346
8347 cutilCheckMsg("imgActsSparse: kernel execution failed");
8348 }
8349
8350 void convImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8351 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups) {
8352 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart,
8353 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1, true);
8354 }
8355
8356 void convImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8357 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups,
8358 float scaleTargets, float scaleOutput) {
8359 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride,
8360 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, true);
8361 }
8362
8363 void localImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8364 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups) {
8365 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart,
8366 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1, false);
8367 }
8368
8369 void localImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8370 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups,
8371 float scaleTargets, float scaleOutput) {
8372 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride,
8373 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, false);
8374 }
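/*
 * Minimal usage sketch (annotation, not in the original source; names and
 * sizes are hypothetical). Shapes follow the _imgActsSparse contract above:
 *
 *   NVMatrix hidActs;   // (numFilters * numModulesY * numModulesX, numImages)
 *   NVMatrix filters;   // (numFilterColors * filterPixels, numFilters)
 *   NVMatrix targets;   // resized internally when scaleTargets == 0
 *   int* dColorIndices; // device array, (numGroups, numFilterColors)
 *
 *   convImgActsSparse(hidActs, filters, targets, dColorIndices,
 *                     imgSizeY, imgSizeX, numModulesY, paddingStart,
 *                     moduleStride, numImgColors, numFilterColors, numGroups);
 *
 * The overload taking scaleTargets/scaleOutput instead accumulates:
 * targets = scaleTargets * targets + scaleOutput * imgActs.
 */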
8375
8376 /*
8377  * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
8378  * All rights reserved. Released under the same BSD license reproduced in
8379  * full at the top of this gist.
8380  */
8401
8402 #ifndef _WEIGHT_ACTS_EXPORT
8403 #define _WEIGHT_ACTS_EXPORT
8404 #endif
8405
8406 #include <weight_acts.cuh>
8407 #include <cudaconv2.cuh>
8408
8409 #define LO16(x) ((x) & 0x0000FFFF)
8410 #define HI16(x) ((x) >> 16)
8411
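// Annotation: these macros pack two small non-negative ints into one 32-bit
// word, e.g. p = (row << 16) + col gives HI16(p) == row and LO16(p) == col.
// conv_weight_acts_c below uses them to cache per-pixel (y, x) filter
// coordinates in the shared pxDivs array.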
8412 /*
8413 * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters
8414 * threadIdx.x determines filter
8415 * threadIdx.y determines pixel in filter
8416 *
8417 * blockIdx.x determines filter batch of B_X, module batch of partialSum
8418 * blockIdx.y determines pixel batch of B_Y * pixelsPerThread
8419 *
8420 * Number of filters must be divisible by B_X
8421 * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false.
8422 *
8423 * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given
8424 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
8425 *
8426 * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters)
8427 *
8428 * B_Y * B_X should be divisible by preloadCases.
8429 * preloadCases one of 16, 32.
8430 * B_X one of 4, 8, 16, 32
8431 * B_Y arbitrary (satisfying divisibility constraints)
8432 * numModules must be divisible by partialSum
8433 *
8434 * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)...
8435 * so the compiler is messing up here somehow. It's unable to optimize that case away.
8436 */
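/*
 * Implied launch geometry (annotation; the host-side dispatcher is not part
 * of this excerpt). Following the blockIdx decomposition inside the kernel,
 * one would expect something like
 *   blocks  = dim3((numModules / partialSum) * (numFilters / B_X),
 *                  DIVUP(filterPixels, B_Y * pixelsPerThread));
 *   threads = dim3(B_X, B_Y);
 */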
8437 template <int B_Y, int B_X, int pixelsPerThread, int preloadCases, int numColors, bool scale, bool checkCaseBounds>
8438 __global__ void conv_weight_acts_c(float* images, float* hidActs, float* targets,
8439 const int numImages, const int numFilters,
8440 const int numModulesY, const int numModulesX,
8441 const int imgSizeY, const int imgSizeX, const int filterSize,
8442 const int paddingStart, const int moduleStride, const int imgStride,
8443 const int partialSum,
8444 const float scaleTargets, const float scaleOutputs) {
8445 __shared__ float shImages[pixelsPerThread * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels
8446 __shared__ float shHidActs[B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidActs
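    // Annotation: the "+ 1" pads each shHidActs row so that strided
    // column-wise accesses presumably hit distinct shared-memory banks,
    // avoiding bank conflicts.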
8447
8448 const int tidx = B_X * threadIdx.y + threadIdx.x;
8449 const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
8450
8451 const int filterPixels = filterSize * filterSize;
8452 const int imgPixels = imgSizeY * imgSizeX;
8453
8454 const int filterBlocksPerModule = numFilters / B_X;
8455 const int outputModuleIdx = blockIdx.x / filterBlocksPerModule;
8456 const int moduleIdx = partialSum * outputModuleIdx;
8457 const int blockFilterIdx = B_X * (blockIdx.x % filterBlocksPerModule);
8458
8459 // const int moduleStride = (imgSize - filterSize + 1) / numModulesX;
8460 const int numModules = numModulesY * numModulesX;
8461
8462 const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread;
8463
8464 images += loadX;
8465 hidActs += moduleIdx * numImages
8466 + blockFilterIdx * numImages * numModules
8467 + loadY * numImages * numModules
8468 + loadX;
8469
8470 targets += (outputModuleIdx * numFilters) * filterPixels * numColors
8471 + blockPixelOffset * numFilters
8472 + blockFilterIdx
8473 + threadIdx.y * numFilters + threadIdx.x;
8474
8475 float* shImgLoad = &shImages[loadY][loadX];
8476 float* shHidActLoad = &shHidActs[loadY][loadX];
8477
8478 float prod[numColors][pixelsPerThread];
8479 #pragma unroll
8480 for (int c = 0; c < numColors; c++) {
8481 #pragma unroll
8482 for (int p = 0; p < pixelsPerThread; p++) {
8483 prod[c][p] = 0;
8484 }
8485 }
8486
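// Editor's note: as in the kernels further down, each pixel's (row, col)
// within the filter is precomputed and packed into one int so the load loop
// avoids an integer division per pixel.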
8487 __shared__ int pxDivs[B_Y*pixelsPerThread];
8488 if (tidx < B_Y * pixelsPerThread) {
8489 pxDivs[tidx] = (((blockPixelOffset + tidx) / filterSize) << 16) + ((blockPixelOffset + tidx) % filterSize);
8490 }
8491 __syncthreads();
8492 for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
8493 const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
8494 const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
8495 for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
8496 if (loadY < B_Y * pixelsPerThread) {
8497 /*
8498 * As long as B_Y * B_X is divisible by preloadCases this will loop the right
8499 * number of times.
8500 *
8501 * This will load some images from filter pixels that don't exist (it'll set those to 0),
8502 * but the code does not produce any output for those pixels (see last lines).
8503 */
8504 // #pragma unroll
8505 for (int y = 0; y < B_Y * pixelsPerThread; y += (B_X * B_Y) / preloadCases) {
8506 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8507 if ((B_Y * pixelsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelsPerThread) {
8508 const int pxIdx = loadY + y; // pixel idx in filter
8509
8510 if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8511 const int pxY = imgLoadModPosY + HI16(pxDivs[pxIdx]); // pixel x,y coords in image
8512 const int pxX = imgLoadModPosX + LO16(pxDivs[pxIdx]);
8513 if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
8514 const int pixIdx = (pxY * imgSizeX + pxX) * imgStride;
8515 #pragma unroll
8516 for (int c = 0; c < numColors; c++) {
8517 shImgLoad[(y + c * pixelsPerThread * B_Y) * preloadCases] = images[caseIdx + c * imgPixels * imgStride + pixIdx];
8518 }
8519 } else {
8520 #pragma unroll
8521 for (int c = 0; c < numColors; c++) {
8522 shImgLoad[(y + c * pixelsPerThread * B_Y) * preloadCases] = 0;
8523 }
8524 }
8525 } else {
8526 #pragma unroll
8527 for (int c = 0; c < numColors; c++) {
8528 shImgLoad[(y + c * pixelsPerThread * B_Y) * preloadCases] = 0;
8529 }
8530 }
8531 }
8532 }
8533 }
8534 if (loadY < B_X && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8535 #pragma unroll
8536 for (int y = 0; y < B_X; y += (B_X * B_Y) / preloadCases) {
8537 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8538 if (B_X % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X) {
8539 shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules];
8540 }
8541 }
8542 }
8543
8544 __syncthreads();
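// Editor's note: each thread owns a numColors x pixelsPerThread tile of
// accumulators for one filter (threadIdx.x); for every preloaded case i it
// adds image pixel value * that filter's hidden activation into prod[c][p].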
8545 #pragma unroll
8546 for (int p = 0; p < pixelsPerThread; p++) {
8547 #pragma unroll
8548 for (int i = 0; i < preloadCases; i++) {
8549 #pragma unroll
8550 for (int c = 0; c < numColors; c++) {
8551 prod[c][p] += shImages[threadIdx.y + p * B_Y + c * pixelsPerThread * B_Y][i] * shHidActs[threadIdx.x][i];
8552 }
8553 }
8554 }
8555 __syncthreads();
8556 }
8557 hidActs += numImages;
8558 }
8559
8560 if (scale) {
8561 #pragma unroll
8562 for (int p = 0; p < pixelsPerThread; p++) {
8563 if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
8564 #pragma unroll
8565 for (int c = 0; c < numColors; c++) {
8566 targets[p * B_Y * numFilters + c * filterPixels * numFilters] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters] + scaleOutputs * prod[c][p];
8567 }
8568 }
8569 }
8570 } else {
8571 #pragma unroll
8572 for (int p = 0; p < pixelsPerThread; p++) {
8573 if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
8574 #pragma unroll
8575 for (int c = 0; c < numColors; c++) {
8576 targets[p * B_Y * numFilters + c * filterPixels * numFilters] = scaleOutputs * prod[c][p];
8577 }
8578 }
8579 }
8580 }
8581 }
8582
8583 /*
8584 * Each block computes weight gradients for B_Y pixels and B_X * filtersPerThread filters
8585 * threadIdx.x determines filter
8586 * threadIdx.y determines pixel in filter
8587 *
8588 * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum
8589 * blockIdx.y determines pixel, color batch of B_Y * colorsPerThread
8590 * In essence, blockIdx.y.x = 0...numFilterColors / colorsPerThread
8591 * blockIdx.y.y = 0...DIVUP(numPixels, B_Y)
8592 * ============
8593 * CONSTRAINTS:
8594 * ============
8595 * numFilters/numGroups must be divisible by B_X * filtersPerThread
8596 * numImgColors/numGroups must be divisible by colorsPerThread
8597 * numFilters must be divisible by numGroups
8598 * numImgColors must be divisible by numGroups
8599 * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false.
8600 *
8601 * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
8602 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
8603 *
8604 * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters)
8605 *
8606 * B_Y * B_X should be divisible by preloadCases.
8607 * preloadCases one of 16, 32.
8608 * B_X one of 4, 8, 16, 32
8609 * B_Y arbitrary (satisfying divisibility constraints)
8610 *
8611 * This routine is especially fast when numFilters >= 32. That's when it should be used.
8612 */
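/*
 * Editor's sketch of the blockIdx.y decoding described above, with
 * illustrative numbers: numFilterColors = 16 and colorsPerThread = 8 give two
 * color batches, so
 *     blockPixelOffset = (blockIdx.y / 2) * B_Y;  // pixel batch
 *     filterColorIdx   = (blockIdx.y % 2) * 8;    // color batch
 * i.e. the grid's y dimension is DIVUP(filterPixels, B_Y) * 2 blocks tall,
 * matching the dim3 computed in _weightActs below.
 */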
8613 template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale, bool checkCaseBounds>
8614 __global__ void conv_weight_acts_mc_mf(float* images, float* hidActs, float* targets,
8615 const int numImages, const int numFilters,
8616 const int numModulesY, const int numModulesX,
8617 const int imgSizeY, const int imgSizeX, const int filterSize,
8618 const int paddingStart, const int moduleStride, const int imgStride,
8619 const int numImgColors, const int numGroups, const int partialSum,
8620 const float scaleTargets, const float scaleOutputs) {
8621 __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels
8622 __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts
8623
8624 const int tidx = B_X * threadIdx.y + threadIdx.x;
8625 const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
8626
8627 const int filterPixels = filterSize * filterSize;
8628 const int imgPixels = imgSizeY * imgSizeX;
8629
8630 const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
8631 const int outputModuleIdx = blockIdx.x / numFilterBlocks;
8632 const int moduleIdx = partialSum * outputModuleIdx;
8633 const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
8634 const int numModules = numModulesY * numModulesX;
8635
8636 const int numFiltersPerGroup = numFilters / numGroups;
8637 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
8638 const int numFilterColors = numImgColors / numGroups;
8639
8640 const int blockPixelOffset = (blockIdx.y / (numFilterColors/colorsPerThread)) * B_Y;
8641 const int filterColorIdx = (blockIdx.y % (numFilterColors/colorsPerThread)) * colorsPerThread;
8642 const int imgColorIdx = filterColorIdx + blockGroupIdx * numFilterColors;
8643
8644 images += imgColorIdx * imgPixels * imgStride + loadX;
8645
8646 hidActs += moduleIdx * numImages
8647 + blockFilterIdx * numImages * numModules
8648 + loadY * numImages * numModules
8649 + loadX;
8650
8651 targets += outputModuleIdx * numFilters * filterPixels * numFilterColors
8652 + filterColorIdx * filterPixels * numFilters
8653 + blockPixelOffset * numFilters
8654 + blockFilterIdx
8655 + threadIdx.y * numFilters + threadIdx.x;
8656
8657 float* shHidActLoad = &shHidActs[loadY][loadX];
8658 float* shImgLoad = &shImages[loadY][loadX];
8659 float prod[colorsPerThread][filtersPerThread];
8660 #pragma unroll
8661 for (int c = 0; c < colorsPerThread; c++) {
8662 #pragma unroll
8663 for (int f = 0; f < filtersPerThread; f++) {
8664 prod[c][f] = 0;
8665 }
8666 }
8667 // This avoids doing a division in an inner loop
8668 __shared__ int pxDivs[B_Y];
8669 if (tidx < B_Y) {
8670 pxDivs[tidx] = (((blockPixelOffset + tidx) / filterSize) << 16) + (blockPixelOffset + tidx) % filterSize;
8671 }
8672 __syncthreads();
8673 for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
8674 const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
8675 const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
8676 for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
8677 if (loadY < B_Y) {
8678 /*
8679 * As long as B_Y * B_X is divisible by preloadCases this will loop the right
8680 * number of times.
8681 *
8682 * This will load some images from filter pixels that don't exist (it'll set those to 0),
8683 * but the code does not produce any output for those pixels (see last lines).
8684 */
8685 // #pragma unroll
8686 for (int y = 0; y < B_Y; y += (B_X * B_Y) / preloadCases) {
8687 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8688 if (B_Y % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y) {
8689 const int pxIdx = loadY + y; // pixel idx in filter
8690
8691 if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8692 const int pxY = imgLoadModPosY + HI16(pxDivs[pxIdx]);//pxIdx / filterSize; // pixel x,y coords in image
8693 const int pxX = imgLoadModPosX + LO16(pxDivs[pxIdx]);
8694 if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
8695 const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
8696 #pragma unroll
8697 for (int c = 0; c < colorsPerThread; c++) {
8698 shImgLoad[(y + c * B_Y) * preloadCases] = images[caseIdx + c * imgPixels * imgStride + pixIdx];
8699 }
8700 } else {
8701 #pragma unroll
8702 for (int c = 0; c < colorsPerThread; c++) {
8703 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8704 }
8705 }
8706 } else {
8707 #pragma unroll
8708 for (int c = 0; c < colorsPerThread; c++) {
8709 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8710 }
8711 }
8712 }
8713 }
8714 }
8715 if (loadY < B_X * filtersPerThread && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8716 #pragma unroll
8717 for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
8718 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8719 if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) {
8720 shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules];
8721 }
8722 }
8723 }
8724
8725 __syncthreads();
8726
8727 #pragma unroll
8728 for (int c = 0; c < colorsPerThread; c++) {
8729 #pragma unroll
8730 for (int i = 0; i < preloadCases; i++) {
8731 #pragma unroll
8732 for (int f = 0; f < filtersPerThread; f++) {
8733 prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i];
8734 }
8735 }
8736 }
8737 __syncthreads();
8738 }
8739 hidActs += numImages;
8740 }
8741 if (blockPixelOffset + threadIdx.y < filterPixels) {
8742 if (scale) {
8743 #pragma unroll
8744 for (int f = 0; f < filtersPerThread; f++) {
8745 #pragma unroll
8746 for (int c = 0; c < colorsPerThread; c++) {
8747 targets[c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f];
8748 }
8749 }
8750 } else {
8751 #pragma unroll
8752 for (int f = 0; f < filtersPerThread; f++) {
8753 #pragma unroll
8754 for (int c = 0; c < colorsPerThread; c++) {
8755 targets[c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f];
8756 }
8757 }
8758 }
8759 }
8760 }
8761
8762 /*
8763 * Each block computes weight gradients for B_Y pixels and B_X * filtersPerThread filters
8764 * threadIdx.x determines filter
8765 * threadIdx.y determines pixel in filter
8766 *
8767 * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum
8768 * blockIdx.y determines pixel, color batch of B_Y * colorsPerThread
8769 * In essence, blockIdx.y.x = 0...numFilterColors / colorsPerThread
8770 * blockIdx.y.y = 0...DIVUP(numPixels, B_Y)
8771 * ============
8772 * CONSTRAINTS:
8773 * ============
8774 * numFilters/numGroups must be divisible by B_X * filtersPerThread
8775 * numFilterColors must be divisible by colorsPerThread
8776 * numFilters must be divisible by numGroups
8777 * numImgColors must be divisible by numFilterColors
8778 * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false.
8779 *
8780 * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
8781 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
8782 *
8783 * targets: (numModules, numFilterColors, filterPixels, numFilters)
8784 * colorIndices: (numGroups, numFilterColors)
8785 *
8786 * B_Y * B_X should be divisible by preloadCases.
8787 * preloadCases one of 16, 32.
8788 * B_X one of 4, 8, 16, 32
8789 * B_Y arbitrary (satisfying divisibility constraints)
8790 *
8791 * This routine is especially fast when numFilters >= 32. That's when it should be used.
8792 */
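/*
 * Editor's note on the sparse variant below: instead of a contiguous color
 * range, the block's colorsPerThread input colors are gathered through
 * colorIndices. Each index is converted once into an element offset,
 *
 *     shColors[c] = colorIndices[blockGroupIdx * numFilterColors
 *                                + filterColorIdx + c] * imgPixels * imgStride;
 *
 * so the load loop can address images[caseIdx + shColors[c] + pixIdx]
 * without redoing the indirection per case.
 */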
8793 template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale, bool checkCaseBounds>
8794 __global__ void conv_weight_acts_mc_mf_rand(float* images, float* hidActs, float* targets, int* colorIndices,
8795 const int numImages, const int numFilters,
8796 const int numModulesY, const int numModulesX,
8797 const int imgSizeY, const int imgSizeX, const int filterSize,
8798 const int paddingStart, const int moduleStride, const int imgStride,
8799 const int numFilterColors, const int numGroups, const int partialSum,
8800 const float scaleTargets, const float scaleOutputs) {
8801 __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels
8802 __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts
8803 __shared__ int shColors[colorsPerThread];
8804 // This avoids doing a division in an inner loop
8805 __shared__ int pxDivs[B_Y];
8806 const int tidx = B_X * threadIdx.y + threadIdx.x;
8807 const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
8808
8809 const int filterPixels = filterSize * filterSize;
8810 const int imgPixels = imgSizeY * imgSizeX;
8811
8812 const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
8813 const int outputModuleIdx = blockIdx.x / numFilterBlocks;
8814 const int moduleIdx = partialSum * outputModuleIdx;
8815 const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
8816 const int numModules = numModulesY * numModulesX;
8817
8818 const int numFiltersPerGroup = numFilters / numGroups;
8819 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
8820
8821 const int blockPixelOffset = (blockIdx.y / (numFilterColors/colorsPerThread)) * B_Y;
8822 const int filterColorIdx = (blockIdx.y % (numFilterColors/colorsPerThread)) * colorsPerThread;
8823 // const int imgColorIdx = filterColorIdx + blockGroupIdx * numFilterColors;
8824
8825 images += loadX;
8826
8827 hidActs += moduleIdx * numImages
8828 + blockFilterIdx * numImages * numModules
8829 + loadY * numImages * numModules
8830 + loadX;
8831
8832 targets += outputModuleIdx * numFilters * filterPixels * numFilterColors
8833 + filterColorIdx * filterPixels * numFilters
8834 + blockPixelOffset * numFilters
8835 + blockFilterIdx
8836 + threadIdx.y * numFilters + threadIdx.x;
8837
8838 float* shHidActLoad = &shHidActs[loadY][loadX];
8839 float* shImgLoad = &shImages[loadY][loadX];
8840 float prod[colorsPerThread][filtersPerThread];
8841 #pragma unroll
8842 for (int c = 0; c < colorsPerThread; c++) {
8843 #pragma unroll
8844 for (int f = 0; f < filtersPerThread; f++) {
8845 prod[c][f] = 0;
8846 }
8847 }
8848
8849 if (tidx < B_Y) {
8850 pxDivs[tidx] = ((blockPixelOffset + tidx) / filterSize << 16) + ((blockPixelOffset + tidx) % filterSize);
8851 }
8852 if (tidx < colorsPerThread) {
8853 shColors[tidx] = colorIndices[blockGroupIdx * numFilterColors + filterColorIdx + tidx] * imgPixels * imgStride;
8854 }
8855 __syncthreads();
8856 for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
8857 const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
8858 const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
8859 for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
8860 if (loadY < B_Y) {
8861 /*
8862 * As long as B_Y * B_X is divisible by preloadCases this will loop the right
8863 * number of times.
8864 *
8865 * This will load some images from filter pixels that don't exist (it'll set those to 0),
8866 * but the code does not produce any output for those pixels (see last lines).
8867 */
8868 // #pragma unroll
8869 for (int y = 0; y < B_Y; y += (B_X * B_Y) / preloadCases) {
8870 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8871 if (B_Y % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y) {
8872 const int pxIdx = loadY + y; // pixel idx in filter
8873
8874 if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8875 const int pxY = imgLoadModPosY + HI16(pxDivs[pxIdx]);//pxIdx / filterSize; // pixel x,y coords in image
8876 const int pxX = imgLoadModPosX + LO16(pxDivs[pxIdx]);
8877 if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
8878 const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
8879 #pragma unroll
8880 for (int c = 0; c < colorsPerThread; c++) {
8881 shImgLoad[(y + c * B_Y) * preloadCases] = images[caseIdx + shColors[c] + pixIdx];
8882 }
8883 } else {
8884 #pragma unroll
8885 for (int c = 0; c < colorsPerThread; c++) {
8886 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8887 }
8888 }
8889 } else {
8890 #pragma unroll
8891 for (int c = 0; c < colorsPerThread; c++) {
8892 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8893 }
8894 }
8895 }
8896 }
8897 }
8898 if (loadY < B_X * filtersPerThread && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8899 #pragma unroll
8900 for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
8901 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8902 if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) {
8903 shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules];
8904 }
8905 }
8906 }
8907
8908 __syncthreads();
8909
8910 #pragma unroll
8911 for (int c = 0; c < colorsPerThread; c++) {
8912 #pragma unroll
8913 for (int i = 0; i < preloadCases; i++) {
8914 #pragma unroll
8915 for (int f = 0; f < filtersPerThread; f++) {
8916 prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i];
8917 }
8918 }
8919 }
8920 __syncthreads();
8921 }
8922 hidActs += numImages;
8923 }
8924 if (blockPixelOffset + threadIdx.y < filterPixels) {
8925 if (scale) {
8926 #pragma unroll
8927 for (int f = 0; f < filtersPerThread; f++) {
8928 #pragma unroll
8929 for (int c = 0; c < colorsPerThread; c++) {
8930 targets[c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f];
8931 }
8932 }
8933 } else {
8934 #pragma unroll
8935 for (int f = 0; f < filtersPerThread; f++) {
8936 #pragma unroll
8937 for (int c = 0; c < colorsPerThread; c++) {
8938 targets[c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f];
8939 }
8940 }
8941 }
8942 }
8943 }
8944
8945 /*
8946 * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
8947 * hidActs: (numFilters, numModules, numImages)
8948 *
8949 * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters)
8950 *
8951 * TODO: you can get a slight speed boost for local non-convolutional units by writing special
8952 * routines for partialSum = 1. But I dunno if the code duplication is worth it...
8953 *
8954 * Note: all of these convolution routines are optimized for the case when
8955 * the number of images (i.e. the minibatch size) is a multiple of 128.
8956 * Other batch sizes will work, but I made no attempt whatsoever
8957 * to make them work fast.
8958 */
8959 void _weightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
8960 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors,
8961 int numGroups, int partialSum, float scaleTargets, float scaleOutput) {
8962 int numFilterColors = numImgColors / numGroups;
8963 int imgStride = images.getStride();
8964 int numImages = images.getNumCols();
8965 int imgPixels = images.getNumRows() / numImgColors;
8966 int imgSizeX = imgPixels / imgSizeY;
8967 int numModules = numModulesY * numModulesX;
8968 int numFilters = hidActs.getNumRows() / numModules;
8969 int numFiltersPerGroup = numFilters / numGroups;
8970
8971 assert(numImgColors % numGroups == 0);
8972 assert(numFilters % (16*numGroups) == 0);
8973 if (!(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))))
8974 {
8975 printf("numGroups: %d\n", numGroups);
8976 printf("numImgColors: %d\n", numImgColors);
8977 assert(false);
8978 }
8979 assert(numGroups == 1 || numFilterColors % 4 == 0);
8980 assert(imgSizeY * imgSizeX == imgPixels);
8981 assert(images.getNumRows() == imgPixels * numImgColors);
8982
8983 int filterPixels = filterSize * filterSize;
8984 partialSum = partialSum == 0 ? numModules : partialSum;
8985
8986 assert(numModules % partialSum == 0);
8987 assert(hidActs.getNumCols() == numImages);
8988
8989 // These routines don't handle the case when only part of the image is visited in the convolution
8990 assert(paddingStart <= 0);
8991 // assert changed to if statement by Ian Goodfellow
8992 if (paddingStart + (numModulesX-1)*moduleStride + filterSize < imgSizeX)
8993 {
8994 printf("imgSizeX: %d\n", imgSizeX);
8995 printf("numModulesX: %d\n", numModulesX);
8996 assert(false);
8997 }
8998 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
8999 assert(moduleStride <= filterSize);
9000
9001 assert(numModules * numFilters == hidActs.getNumRows());
9002
9003 assert(!images.isTrans());
9004 assert(!hidActs.isTrans());
9005 assert(hidActs.isContiguous());
9006
9007 assert(!targets.isTrans());
9008 assert(targets.isContiguous());
9009
9010 int preloadCases = 32;
9011
9012 dim3 blocks, threads;
9013 int bx, by;
9014 int pixelsPerThread, filtersPerThread, colorsPerThread;
9015 // Worth playing with these parameters to find best values for your problem.
9016 // These values work relatively well, but are not optimal for all problems.
9017 if (numFilterColors > 3) {
9018 filtersPerThread = numFiltersPerGroup % 32 == 0 ? 2 : 1;
9019 colorsPerThread = numFilterColors % 8 == 0 ? 8 : 4;
9020 by = numFiltersPerGroup % 64 == 0 ? 4 : 8;
9021 bx = numFiltersPerGroup % 64 == 0 ? 32 : 16;
9022 blocks = dim3((numModules/partialSum)*(numFilters/(bx*filtersPerThread)), DIVUP(filterPixels, by) * (numFilterColors / colorsPerThread));
9023 } else {
9024 assert(numGroups == 1); // Just for sanity
9025 pixelsPerThread = numFilters % 32 == 0 ? (numImgColors == 1 ? 8 : 5) : (numImgColors == 1 ? 5 : 2);
9026 by = numFilters % 32 == 0 ? 4 : 8; // by == 4 seems to work best
9027 bx = numFilters % 32 == 0 ? 32 : 16;
9028 blocks = dim3((numModules/partialSum)*(numFilters/bx), DIVUP(filterPixels, by*pixelsPerThread));
9029 }
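/*
 * Editor's worked example of the heuristic above (illustrative numbers): for
 * numFilterColors = 16 and numFiltersPerGroup = 64 it picks filtersPerThread
 * = 2, colorsPerThread = 8, by = 4, bx = 32 -- 128 threads per block -- and,
 * for a single group, a grid of
 *     x: (numModules / partialSum) * (64 / (32 * 2)) = numModules / partialSum
 *     y: DIVUP(filterPixels, 4) * (16 / 8)
 */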
9030 assert((by * bx) % preloadCases == 0);
9031 threads = dim3(bx, by);
9032 bool checkCaseBounds = numImages % 32 != 0;
9033
9034 /* Modified by Ian Goodfellow. I removed the branch here, because our wrapper doesn't
9035 support resizing when the data isn't owned by the NVMatrix. Also, the resize should
9036 always be a no-op, because in the context we're likely to use this, we should always
9037 have allocated the right size of NVMatrix to receive the gradient.
9038 if (scaleTargets == 0) {
9039 targets.resize((numModules/partialSum) * numFilterColors*filterPixels, numFilters);
9040 } else {
9041 assert(targets.getNumRows() == (numModules/partialSum) * numFilterColors*filterPixels);
9042 assert(targets.getNumCols() == numFilters);
9043 }
9044 */
9045
9046
9047 assert(targets.getNumRows() == (numModules/partialSum) * numFilterColors*filterPixels);
9048 assert(targets.getNumCols() == numFilters);
9049
9050 if (numFilterColors > 3) {
9051 if (scaleTargets == 0) { // do not scale
9052 if (numFiltersPerGroup % 64 == 0) {
9053 if (numFilterColors % 8 == 0) {
9054 if (checkCaseBounds) {
9055 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, false, true>, cudaFuncCachePreferShared);
9056 conv_weight_acts_mc_mf<4,32,2,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9057 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9058 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9059 } else {
9060 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, false, false>, cudaFuncCachePreferShared);
9061 conv_weight_acts_mc_mf<4,32,2,8,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9062 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9063 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9064 }
9065 } else {
9066 if (checkCaseBounds) {
9067 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, false, true>, cudaFuncCachePreferShared);
9068 conv_weight_acts_mc_mf<4,32,2,4,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9069 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9070 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9071 } else {
9072 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, false, false>, cudaFuncCachePreferShared);
9073 conv_weight_acts_mc_mf<4,32,2,4,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9074 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9075 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9076 }
9077 }
9078 } else if (numFiltersPerGroup % 32 == 0) {
9079 if (numFilterColors % 8 == 0) {
9080 if (checkCaseBounds) {
9081 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, false, true>, cudaFuncCachePreferShared);
9082 conv_weight_acts_mc_mf<8,16,2,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9083 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9084 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9085 } else {
9086 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, false, false>, cudaFuncCachePreferShared);
9087 conv_weight_acts_mc_mf<8,16,2,8,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9088 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9089 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9090 }
9091 } else {
9092 if (checkCaseBounds) {
9093 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, false, true>, cudaFuncCachePreferShared);
9094 conv_weight_acts_mc_mf<8,16,2,4,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9095 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9096 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9097 } else {
9098 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, false, false>, cudaFuncCachePreferShared);
9099 conv_weight_acts_mc_mf<8,16,2,4,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9100 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9101 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9102 }
9103 }
9104 } else {
9105 if (numFilterColors % 8 == 0) {
9106 if (checkCaseBounds) {
9107 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, false, true>, cudaFuncCachePreferShared);
9108 conv_weight_acts_mc_mf<8,16,1,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9109 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9110 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9111 } else {
9112 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, false, false>, cudaFuncCachePreferShared);
9113 conv_weight_acts_mc_mf<8,16,1,8,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9114 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9115 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9116 }
9117 } else {
9118 if (checkCaseBounds) {
9119 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, false, true>, cudaFuncCachePreferShared);
9120 conv_weight_acts_mc_mf<8,16,1,4,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9121 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9122 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9123 } else {
9124 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, false, false>, cudaFuncCachePreferShared);
9125 conv_weight_acts_mc_mf<8,16,1,4,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9126 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9127 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9128 }
9129 }
9130 }
9131 } else {
9132
9133 if (numFiltersPerGroup % 64 == 0) {
9134 if (numFilterColors % 8 == 0) {
9135 if (checkCaseBounds) {
9136 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, true, true>, cudaFuncCachePreferShared);
9137 conv_weight_acts_mc_mf<4,32,2,8,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9138 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9139 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9140 } else {
9141 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, true, false>, cudaFuncCachePreferShared);
9142 conv_weight_acts_mc_mf<4,32,2,8,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9143 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9144 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9145 }
9146 } else {
9147 if (checkCaseBounds) {
9148 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, true, true>, cudaFuncCachePreferShared);
9149 conv_weight_acts_mc_mf<4,32,2,4,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9150 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9151 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9152 } else {
9153 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, true, false>, cudaFuncCachePreferShared);
9154 conv_weight_acts_mc_mf<4,32,2,4,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9155 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9156 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9157 }
9158 }
9159 } else if (numFiltersPerGroup % 32 == 0) {
9160 if (numFilterColors % 8 == 0) {
9161 if (checkCaseBounds) {
9162 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, true, true>, cudaFuncCachePreferShared);
9163 conv_weight_acts_mc_mf<8,16,2,8,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9164 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9165 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9166 } else {
9167 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, true, false>, cudaFuncCachePreferShared);
9168 conv_weight_acts_mc_mf<8,16,2,8,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9169 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9170 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9171 }
9172 } else {
9173 if (checkCaseBounds) {
9174 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, true, true>, cudaFuncCachePreferShared);
9175 conv_weight_acts_mc_mf<8,16,2,4,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9176 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9177 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9178 } else {
9179 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, true, false>, cudaFuncCachePreferShared);
9180 conv_weight_acts_mc_mf<8,16,2,4,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9181 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9182 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9183 }
9184 }
9185 } else {
9186 if (numFilterColors % 8 == 0) {
9187 if (checkCaseBounds) {
9188 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, true, true>, cudaFuncCachePreferShared);
9189 conv_weight_acts_mc_mf<8,16,1,8,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9190 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9191 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9192 } else {
9193 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, true, false>, cudaFuncCachePreferShared);
9194 conv_weight_acts_mc_mf<8,16,1,8,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9195 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9196 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9197 }
9198 } else {
9199 if (checkCaseBounds) {
9200 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, true, true>, cudaFuncCachePreferShared);
9201 conv_weight_acts_mc_mf<8,16,1,4,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9202 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9203 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9204 } else {
9205 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, true, false>, cudaFuncCachePreferShared);
9206 conv_weight_acts_mc_mf<8,16,1,4,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9207 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9208 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9209 }
9210 }
9211 }
9212 }
9213 } else { // numColors in 1,2,3
9214 if (scaleTargets == 0) { // do not scale
9215 if (numFilterColors == 1) {
9216 if (checkCaseBounds) {
9217 if (numFilters % 32 == 0) {
9218 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, false, true>, cudaFuncCachePreferShared);
9219 conv_weight_acts_c<4,32,8,32,1,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9220 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9221 } else {
9222 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, false, true>, cudaFuncCachePreferShared);
9223 conv_weight_acts_c<8,16,5,32,1,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9224 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9225 }
9226 } else {
9227 if (numFilters % 32 == 0) {
9228 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, false, false>, cudaFuncCachePreferShared);
9229 conv_weight_acts_c<4,32,8,32,1,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9230 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9231 } else {
9232 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, false, false>, cudaFuncCachePreferShared);
9233 conv_weight_acts_c<8,16,5,32,1,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9234 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9235 }
9236 }
9237 } else if (numFilterColors == 2) {
9238 if (checkCaseBounds) {
9239 if (numFilters % 32 == 0) {
9240 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, false, true>, cudaFuncCachePreferShared);
9241 conv_weight_acts_c<4,32,5,32,2,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9242 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9243 } else {
9244 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, false, true>, cudaFuncCachePreferShared);
9245 conv_weight_acts_c<8,16,2,32,2,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9246 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9247 }
9248 } else {
9249 if (numFilters % 32 == 0) {
9250 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, false, false>, cudaFuncCachePreferShared);
9251 conv_weight_acts_c<4,32,5,32,2,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9252 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9253 } else {
9254 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, false, false>, cudaFuncCachePreferShared);
9255 conv_weight_acts_c<8,16,2,32,2,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9256 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9257 }
9258 }
9259 } else if (numFilterColors == 3) {
9260 if (checkCaseBounds) {
9261 if (numFilters % 32 == 0) {
9262 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, false, true>, cudaFuncCachePreferShared);
9263 conv_weight_acts_c<4,32,5,32,3,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9264 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9265 } else {
9266 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, false, true>, cudaFuncCachePreferShared);
9267 conv_weight_acts_c<8,16,2,32,3,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9268 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9269 }
9270 } else {
9271 if (numFilters % 32 == 0) {
9272 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, false, false>, cudaFuncCachePreferShared);
9273 conv_weight_acts_c<4,32,5,32,3,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9274 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9275 } else {
9276 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, false, false>, cudaFuncCachePreferShared);
9277 conv_weight_acts_c<8,16,2,32,3,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9278 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9279 }
9280 }
9281 }
9282
9283 } else { // do scale
9284 if (numFilterColors == 1) {
9285 if (checkCaseBounds) {
9286 if (numFilters % 32 == 0) {
9287 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, true, true>, cudaFuncCachePreferShared);
9288 conv_weight_acts_c<4,32,8,32,1,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9289 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9290 } else {
9291 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, true, true>, cudaFuncCachePreferShared);
9292 conv_weight_acts_c<8,16,5,32,1,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9293 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9294 }
9295 } else {
9296 if (numFilters % 32 == 0) {
9297 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, true, false>, cudaFuncCachePreferShared);
9298 conv_weight_acts_c<4,32,8,32,1,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9299 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9300 } else {
9301 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, true, false>, cudaFuncCachePreferShared);
9302 conv_weight_acts_c<8,16,5,32,1,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9303 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9304 }
9305 }
9306 } else if (numFilterColors == 2) {
9307 if (checkCaseBounds) {
9308 if (numFilters % 32 == 0) {
9309 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, true, true>, cudaFuncCachePreferShared);
9310 conv_weight_acts_c<4,32,5,32,2,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9311 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9312 } else {
9313 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, true, true>, cudaFuncCachePreferShared);
9314 conv_weight_acts_c<8,16,2,32,2,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9315 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9316 }
9317 } else {
9318 if (numFilters % 32 == 0) {
9319 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, true, false>, cudaFuncCachePreferShared);
9320 conv_weight_acts_c<4,32,5,32,2,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9321 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9322 } else {
9323 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, true, false>, cudaFuncCachePreferShared);
9324 conv_weight_acts_c<8,16,2,32,2,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9325 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9326 }
9327 }
9328 } else if (numFilterColors == 3) {
9329 if (checkCaseBounds) {
9330 if (numFilters % 32 == 0) {
9331 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, true, true>, cudaFuncCachePreferShared);
9332 conv_weight_acts_c<4,32,5,32,3,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9333 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9334 } else {
9335 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, true, true>, cudaFuncCachePreferShared);
9336 conv_weight_acts_c<8,16,2,32,3,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9337 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9338 }
9339 } else {
9340 if (numFilters % 32 == 0) {
9341 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, true, false>, cudaFuncCachePreferShared);
9342 conv_weight_acts_c<4,32,5,32,3,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9343 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9344 } else {
9345 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, true, false>, cudaFuncCachePreferShared);
9346 conv_weight_acts_c<8,16,2,32,3,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9347 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9348 }
9349 }
9350 }
9351 }
9352 }
9353 cutilCheckMsg("weightActs: kernel execution failed");
9354 }
9355
9356 void convWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9357 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups, int partialSum) {
9358 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, partialSum, 0, 1);
9359 }
9360
9361 void convWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9362 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups, int partialSum,
9363 float scaleTargets, float scaleOutput) {
9364 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9365 }
9366
9367 void localWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9368 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups) {
9369 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, 1, 0, 1);
9370 }
9371
9372 void localWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9373 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
9374 int numImgColors, int numGroups, float scaleTargets, float scaleOutput) {
9375 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, 1,
9376 scaleTargets, scaleOutput);
9377 }
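// Editor's usage sketch (hedged: shapes per the comments above, call not from
// the original source). With partialSum = 0 the wrapper sums over all
// modules, so targets holds one gradient per (color, pixel, filter):
//
//     convWeightActs(images,   // (numImgColors * imgPixels, numImages)
//                    hidActs,  // (numFilters * numModules, numImages)
//                    targets,  // (numFilterColors * filterPixels, numFilters)
//                    imgSizeY, numModulesY, numModulesX, filterSize,
//                    paddingStart, moduleStride, numImgColors, numGroups,
//                    0);       // partialSum == 0 -> one full sum over modules
//
// The shorter overloads fix scaleTargets = 0 (overwrite) and scaleOutput = 1;
// localWeightActs hardwires partialSum = 1, matching local (untied) layers
// that keep a separate filter per module.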
9378
9379 /*
9380 * images: (numImgColors, imgPixels, numImages), with stride given
9381 * hidActs: (numFilters, numModules, numImages)
9382 *
9383 * targets: (numModules/partialSum, numFilterColors, filterPixels, numFilters)
9384 * colorIndices: (numGroups, numFilterColors)
9385 *
9386 * Note: all of these convolution routines are optimized for the case when
9387 * the number of images (i.e. the minibatch size) is a multiple of 128.
9388 * Other batch sizes will work, but I made no attempt whatsoever
9389 * to make them work fast.
9390 */
9391 void _weightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
9392 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
9393 int numImgColors, int numFilterColors, int numGroups, int partialSum,
9394 float scaleTargets, float scaleOutput) {
9395 int imgStride = images.getStride();
9396 int numImages = images.getNumCols();
9397 int imgPixels = images.getNumRows() / numImgColors;
9398 int imgSizeX = imgPixels / imgSizeY;
9399 int numModules = numModulesY * numModulesX;
9400 int numFilters = hidActs.getNumRows() / numModules;
9401 int numFiltersPerGroup = numFilters / numGroups;
9402
9403 assert(numGroups > 1);
9404 assert(numImgColors % numFilterColors == 0);
9405 assert((numFilterColors * numGroups) % numImgColors == 0);
9406 assert(numFilters % (16*numGroups) == 0);
9407 assert(numFilterColors % 4 == 0);
9408 assert(imgSizeY * imgSizeX == imgPixels);
9409 assert(images.getNumRows() == imgPixels * numImgColors);
9410
9411 int filterPixels = filterSize * filterSize;
9412 partialSum = partialSum == 0 ? numModules : partialSum;
9413
9414 assert(numModules % partialSum == 0);
9415 assert(hidActs.getNumCols() == numImages);
9416
9417 // These routines don't handle the case when only part of the image is visited in the convolution
9418 assert(paddingStart <= 0);
9419 assert(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
9420 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
9421 assert(moduleStride <= filterSize);
9422
9423 assert(numModules * numFilters == hidActs.getNumRows());
9424
9425 assert(!images.isTrans());
9426 assert(!hidActs.isTrans());
9427 assert(hidActs.isContiguous());
9428
9429 assert(!targets.isTrans());
9430 assert(targets.isContiguous());
9431
9432 int preloadCases = 32;
9433
9434 dim3 blocks, threads;
9435 int bx, by;
9436 int filtersPerThread, colorsPerThread;
9437
9438 filtersPerThread = numFiltersPerGroup % 32 == 0 ? 2 : 1;
9439 colorsPerThread = numFilterColors % 8 == 0 ? 8 : 4;
9440 by = numFiltersPerGroup % 64 == 0 ? 4 : 8;
9441 bx = numFiltersPerGroup % 64 == 0 ? 32 : 16;
9442 blocks = dim3((numModules/partialSum)*(numFilters/(bx*filtersPerThread)), DIVUP(filterPixels, by) * (numFilterColors / colorsPerThread));
9443
9444 assert((by * bx) % preloadCases == 0);
9445 threads = dim3(bx, by);
9446 bool checkCaseBounds = numImages % 32 != 0;
9447
9448 if (scaleTargets == 0) {
9449 targets.resize((numModules/partialSum) * numFilterColors*filterPixels, numFilters);
9450 } else {
9451 assert(targets.getNumRows() == (numModules/partialSum) * numFilterColors*filterPixels);
9452 assert(targets.getNumCols() == numFilters);
9453 }
9454
9455 if (scaleTargets == 0) { // do not scale
9456 if (numFiltersPerGroup % 64 == 0) {
9457 if (numFilterColors % 8 == 0) {
9458 if (checkCaseBounds) {
9459 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, false, true>, cudaFuncCachePreferShared);
9460 conv_weight_acts_mc_mf_rand<4,32,2,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,8,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else if (numFiltersPerGroup % 32 == 0) {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        }

    } else {
        // Same dispatch tree, with the kernels instantiated for scale == true.
        // Each cudaFuncSetCacheConfig names the exact instantiation launched below it.
        if (numFiltersPerGroup % 64 == 0) {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else if (numFiltersPerGroup % 32 == 0) {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        }
    }
    cutilCheckMsg("weightActsSparse: kernel execution failed");
}

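/*
 * Convenience wrappers around _weightActsSparse. The conv* variants use one
 * filter bank shared across all modules (partialSum = 0, which in this code
 * base means "sum the weight gradient over every module"); the local*
 * variants use unshared, per-module filters (partialSum = 1).
 */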
void convWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                          int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                          int numImgColors, int numFilterColors, int numGroups) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, 0, 1, 0);
}

void convWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                          int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                          int numImgColors, int numFilterColors, int numGroups, int partialSum, float scaleTargets, float scaleOutput) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
}

void localWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                           int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                           int numImgColors, int numFilterColors, int numGroups) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, 1, 1, 0);
}

void localWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                           int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                           int numImgColors, int numFilterColors, int numGroups, float scaleTargets, float scaleOutput) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, 1, scaleTargets, scaleOutput);
}

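One detail worth flagging in the dispatch tree above: cudaFuncSetCacheConfig binds a cache preference to one specific kernel, so for a template kernel it must name exactly the same instantiation that is launched immediately after it; otherwise the preference is attached to an instantiation that never runs and the launched kernel keeps the default shared-memory/L1 split. A minimal sketch of the pairing, with a hypothetical kernel kSketch standing in for conv_weight_acts_mc_mf_rand:

#include <cuda_runtime.h>

template <int B_X, bool CHECK>
__global__ void kSketch(float* out, int n) {
    const int i = blockIdx.x * B_X + threadIdx.x;
    // CHECK toggles the bounds test, like checkCaseBounds above; when CHECK
    // is false the caller must guarantee the grid exactly covers n elements.
    if (!CHECK || i < n) {
        out[i] = 1.0f;
    }
}

void launchSketch(float* out, int n, dim3 blocks, dim3 threads, bool check) {
    if (check) {
        // The instantiation named here must be the one launched below.
        cudaFuncSetCacheConfig(kSketch<32, true>, cudaFuncCachePreferShared);
        kSketch<32, true><<<blocks, threads>>>(out, n);
    } else {
        cudaFuncSetCacheConfig(kSketch<32, false>, cudaFuncCachePreferShared);
        kSketch<32, false><<<blocks, threads>>>(out, n);
    }
}
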
===============================
In file included from /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1804:0,
from /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarrayobject.h:17,
from /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayobject.h:4,
from /usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/cuda_ndarray.cuh:35,
from /home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/nvmatrix.cuh:49,
from mod.cu:130:
/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:15:2: warning: #warning "Using deprecated NumPy API, disable it by " "#defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
#warning "Using deprecated NumPy API, disable it by " \
^
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(90): error: more than one instance of overloaded function "cublasGetVersion_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(102): error: more than one instance of overloaded function "cublasSnrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(103): error: more than one instance of overloaded function "cublasDnrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(104): error: more than one instance of overloaded function "cublasScnrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(105): error: more than one instance of overloaded function "cublasDznrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(108): error: more than one instance of overloaded function "cublasSdot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(110): error: more than one instance of overloaded function "cublasDdot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(112): error: more than one instance of overloaded function "cublasCdotu_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(114): error: more than one instance of overloaded function "cublasCdotc_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(116): error: more than one instance of overloaded function "cublasZdotu_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(118): error: more than one instance of overloaded function "cublasZdotc_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(122): error: more than one instance of overloaded function "cublasSscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(123): error: more than one instance of overloaded function "cublasDscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(124): error: more than one instance of overloaded function "cublasCscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(125): error: more than one instance of overloaded function "cublasZscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(127): error: more than one instance of overloaded function "cublasCsscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(128): error: more than one instance of overloaded function "cublasZdscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(131): error: more than one instance of overloaded function "cublasSaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(133): error: more than one instance of overloaded function "cublasDaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(135): error: more than one instance of overloaded function "cublasCaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(137): error: more than one instance of overloaded function "cublasZaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(141): error: more than one instance of overloaded function "cublasScopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(143): error: more than one instance of overloaded function "cublasDcopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(145): error: more than one instance of overloaded function "cublasCcopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(147): error: more than one instance of overloaded function "cublasZcopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(151): error: more than one instance of overloaded function "cublasSswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(152): error: more than one instance of overloaded function "cublasDswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(153): error: more than one instance of overloaded function "cublasCswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(154): error: more than one instance of overloaded function "cublasZswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(157): error: more than one instance of overloaded function "cublasIsamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(158): error: more than one instance of overloaded function "cublasIdamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(159): error: more than one instance of overloaded function "cublasIcamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(160): error: more than one instance of overloaded function "cublasIzamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(163): error: more than one instance of overloaded function "cublasIsamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(164): error: more than one instance of overloaded function "cublasIdamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(166): error: more than one instance of overloaded function "cublasIcamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(167): error: more than one instance of overloaded function "cublasIzamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(170): error: more than one instance of overloaded function "cublasSasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(171): error: more than one instance of overloaded function "cublasDasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(172): error: more than one instance of overloaded function "cublasScasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(173): error: more than one instance of overloaded function "cublasDzasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(176): error: more than one instance of overloaded function "cublasSrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(178): error: more than one instance of overloaded function "cublasDrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(180): error: more than one instance of overloaded function "cublasCrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(182): error: more than one instance of overloaded function "cublasZrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(185): error: more than one instance of overloaded function "cublasCsrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(187): error: more than one instance of overloaded function "cublasZdrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(191): error: more than one instance of overloaded function "cublasSrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(192): error: more than one instance of overloaded function "cublasDrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(193): error: more than one instance of overloaded function "cublasCrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(195): error: more than one instance of overloaded function "cublasZrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(199): error: more than one instance of overloaded function "cublasSrotm_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(201): error: more than one instance of overloaded function "cublasDrotm_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(205): error: more than one instance of overloaded function "cublasSrotmg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(207): error: more than one instance of overloaded function "cublasDrotmg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(212): error: more than one instance of overloaded function "cublasSgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(215): error: more than one instance of overloaded function "cublasDgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(218): error: more than one instance of overloaded function "cublasCgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(221): error: more than one instance of overloaded function "cublasZgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(226): error: more than one instance of overloaded function "cublasSgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(230): error: more than one instance of overloaded function "cublasDgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(234): error: more than one instance of overloaded function "cublasCgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(238): error: more than one instance of overloaded function "cublasZgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(244): error: more than one instance of overloaded function "cublasStrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(246): error: more than one instance of overloaded function "cublasDtrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(248): error: more than one instance of overloaded function "cublasCtrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(250): error: more than one instance of overloaded function "cublasZtrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(254): error: more than one instance of overloaded function "cublasStbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(256): error: more than one instance of overloaded function "cublasDtbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(258): error: more than one instance of overloaded function "cublasCtbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(260): error: more than one instance of overloaded function "cublasZtbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(264): error: more than one instance of overloaded function "cublasStpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(266): error: more than one instance of overloaded function "cublasDtpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(268): error: more than one instance of overloaded function "cublasCtpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(270): error: more than one instance of overloaded function "cublasZtpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(273): error: more than one instance of overloaded function "cublasStrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(275): error: more than one instance of overloaded function "cublasDtrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(277): error: more than one instance of overloaded function "cublasCtrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(279): error: more than one instance of overloaded function "cublasZtrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(283): error: more than one instance of overloaded function "cublasStpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(286): error: more than one instance of overloaded function "cublasDtpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(288): error: more than one instance of overloaded function "cublasCtpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(290): error: more than one instance of overloaded function "cublasZtpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(294): error: more than one instance of overloaded function "cublasStbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(298): error: more than one instance of overloaded function "cublasDtbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(301): error: more than one instance of overloaded function "cublasCtbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(305): error: more than one instance of overloaded function "cublasZtbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(310): error: more than one instance of overloaded function "cublasSsymv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(313): error: more than one instance of overloaded function "cublasDsymv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(316): error: more than one instance of overloaded function "cublasChemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(319): error: more than one instance of overloaded function "cublasZhemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(324): error: more than one instance of overloaded function "cublasSsbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(327): error: more than one instance of overloaded function "cublasDsbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(330): error: more than one instance of overloaded function "cublasChbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(333): error: more than one instance of overloaded function "cublasZhbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(338): error: more than one instance of overloaded function "cublasSspmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(341): error: more than one instance of overloaded function "cublasDspmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(344): error: more than one instance of overloaded function "cublasChpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(347): error: more than one instance of overloaded function "cublasZhpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(353): error: more than one instance of overloaded function "cublasSger_v2" has "C" linkage
Error limit reached.
100 errors detected in the compilation of "/tmp/tmpxft_000009bf_00000000-8_mod.cpp1.ii".
Compilation terminated.
ERROR (pylearn2.sandbox.cuda_convnet.convnet_compile): Failed to compile /home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet/mod.cu ('nvmatrix_kernels.cu', 'nvmatrix.cu', 'conv_util.cu', 'filter_acts.cu', 'img_acts.cu', 'weight_acts.cu'): ('nvcc return status', 4, 'for cmd', 'nvcc -shared -g -O3 -arch=sm_30 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=d67f7c8a21306c67152a70a88a837011,-fPIC -Xlinker -rpath,/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray -I/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/ -I/usr/local/lib/python2.7/dist-packages/numpy/core/include -I/usr/include/python2.7 -I/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda -o /home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet/cuda_convnet.so mod.cu -L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray -L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet -LNone/lib -LNone/lib64 -L/usr/lib -lpython2.7 -lcublas -lcudart')
['nvcc', '-shared', '-g', '-O3', '-arch=sm_30', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=d67f7c8a21306c67152a70a88a837011,-fPIC', '-Xlinker', '-rpath,/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray', '-I/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/', '-I/usr/local/lib/python2.7/dist-packages/numpy/core/include', '-I/usr/include/python2.7', '-I/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda', '-o', '/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet/cuda_convnet.so', 'mod.cu', '-L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray', '-L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet', '-LNone/lib', '-LNone/lib64', '-L/usr/lib', '-lpython2.7', '-lcublas', '-lcudart']
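All of the cublas.h errors above are one diagnostic repeated: C++ allows a function name to be overloaded, but at most one overload may be declared with "C" linkage, and nvcc reports this as soon as it sees a second extern "C" declaration of an already-declared name with a different prototype. Here that most likely happens because mod.cu ends up pulling both the legacy cuBLAS header (cublas.h) and the v2 API declarations into one translation unit. A cuBLAS-free two-liner reproduces the diagnostic (cublas_like is a made-up name used only for illustration):

extern "C" void cublas_like(int n);
extern "C" void cublas_like(float x); // error: more than one instance of overloaded
                                      //        function "cublas_like" has "C" linkage
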
Traceback (most recent call last):
File "/home/ubuntu/pylearn2/pylearn2/scripts/train.py", line 252, in <module>
args.verbose_logging, args.debug)
File "/home/ubuntu/pylearn2/pylearn2/scripts/train.py", line 197, in train
train_obj = serial.load_train_file(config)
File "/home/ubuntu/pylearn2/pylearn2/utils/serial.py", line 524, in load_train_file
return yaml_parse.load_path(config_file_path, environ=environ)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 379, in load_path
return load(content, instantiate=instantiate, environ=environ, **kwargs)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 337, in load
return _instantiate(proxy_graph)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 280, in _instantiate
return _instantiate_proxy_tuple(proxy, bindings)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 229, in _instantiate_proxy_tuple
for k, v in proxy.keywords.iteritems())
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 229, in <genexpr>
for k, v in proxy.keywords.iteritems())
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 280, in _instantiate
return _instantiate_proxy_tuple(proxy, bindings)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 230, in _instantiate_proxy_tuple
obj = checked_call(proxy.callable, kwargs)
File "/home/ubuntu/pylearn2/pylearn2/utils/call_check.py", line 99, in checked_call
return to_call(**kwargs)
File "/home/ubuntu/pylearn2/pylearn2/models/mlp.py", line 490, in __init__
self._update_layer_input_spaces()
File "/home/ubuntu/pylearn2/pylearn2/models/mlp.py", line 555, in _update_layer_input_spaces
layers[0].set_input_space(self.get_input_space())
File "/home/ubuntu/pylearn2/pylearn2/models/maxout.py", line 803, in set_input_space
dummy_p = dummy_p.eval()
File "/usr/local/lib/python2.7/dist-packages/theano/gof/graph.py", line 420, in eval
self._fn = theano.function(self._fn_inputs, self)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function.py", line 223, in function
profile=profile)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/pfunc.py", line 512, in pfunc
on_unused_input=on_unused_input)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.py", line 1312, in orig_function
defaults)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.py", line 1181, in create
_fn, _i, _o = self.linker.make_thunk(input_storage=input_storage_lists)
File "/usr/local/lib/python2.7/dist-packages/theano/gof/link.py", line 434, in make_thunk
output_storage=output_storage)[:3]
File "/usr/local/lib/python2.7/dist-packages/theano/gof/vm.py", line 847, in make_all
no_recycling))
File "/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/pool.py", line 334, in make_thunk
raise RuntimeError('Could not compile cuda_convnet')
RuntimeError: ('The following error happened while compiling the node', <pylearn2.sandbox.cuda_convnet.pool.MaxPool object at 0x7f82776f8410>(GpuContiguous.0), '\n', 'Could not compile cuda_convnet')